0-hero committed
Commit 485133c · verified · 1 parent: d742687

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .cache/pip/http-v2/0/1/f/2/0/01f2082df50502ba9492d64e69db99d1fdb5730707a16c6264b355b8.body +3 -0
  2. .cache/pip/http-v2/0/2/d/4/2/02d4221e858694abc22129c65515f1df2c4c326330eb1a34ceb0b382.body +3 -0
  3. .cache/pip/http-v2/4/0/2/3/b/4023be7b5b37a7a4144c804ce69828082d4fb2a124d9d8aabc855da8.body +3 -0
  4. .cache/pip/http-v2/4/f/d/2/5/4fd254dbd56deb4021e55d22c4b489f6c776c69c316eb7345bc91691.body +3 -0
  5. .cache/pip/http-v2/9/6/e/8/3/96e83221dd149da9a3d38feebc955beb2034effd910108971c5b167b.body +3 -0
  6. .cache/pip/http-v2/9/e/8/c/8/9e8c8c0496d6d3384d616902379ed05e07b6b1dba9673d70b5fef231.body +3 -0
  7. .cache/pip/http-v2/a/e/7/a/2/ae7a241673cf118ca18eca030dc29d2715b1980127dd0e2949514433.body +3 -0
  8. .cache/pip/http-v2/d/3/3/a/b/d33abf9ad709d023fff05902f39da682c1afb233bcd9f2c479487586.body +3 -0
  9. .cache/pip/http-v2/d/b/1/f/6/db1f6b45c0850c8e2ce7d8b47148edeca6e8115413af41f4ecc8ce32.body +3 -0
  10. .cache/pip/http-v2/f/5/2/7/6/f52769e4b4d00542e1e056baf2db3e5ad8f277bff67f2636cace711d.body +3 -0
  11. .cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826/flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl +3 -0
  12. .gitattributes +20 -0
  13. .launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 +0 -0
  14. .local/share/jupyter/nbextensions/go_to_current_running_cell/auto_focus.gif +3 -0
  15. .local/share/jupyter/nbextensions/nbTranslate/demo1.gif +3 -0
  16. .local/share/jupyter/nbextensions/nbTranslate/demo2.gif +3 -0
  17. .local/share/jupyter/nbextensions/scratchpad/demo.gif +3 -0
  18. .local/share/jupyter/nbextensions/toc2/demo.gif +3 -0
  19. .local/share/jupyter/nbextensions/zenmode/images/back11.jpg +0 -0
  20. .local/share/jupyter/nbextensions/zenmode/images/back21.jpg +0 -0
  21. .local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png +0 -0
  22. .triton/dump/11759acf26ac56366b171628132485d6/triton_.llir +347 -0
  23. .triton/dump/11759acf26ac56366b171628132485d6/triton_.ttgir +78 -0
  24. .triton/dump/11759acf26ac56366b171628132485d6/triton_.ttir +76 -0
  25. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx +988 -0
  26. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir +104 -0
  27. .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx +782 -0
  28. .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir +88 -0
  29. .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.llir +980 -0
  30. .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ptx +1654 -0
  31. .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttir +104 -0
  32. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx +465 -0
  33. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttgir +39 -0
  34. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir +38 -0
  35. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin +0 -0
  36. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir +550 -0
  37. .triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.cubin +0 -0
  38. .triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.llir +85 -0
  39. .triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin +0 -0
  40. .triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx +1810 -0
  41. .triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir +153 -0
  42. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.cubin +0 -0
  43. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir +1360 -0
  44. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ptx +2004 -0
  45. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir +164 -0
  46. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir +151 -0
  47. .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir +169 -0
  48. .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir +110 -0
  49. .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttir +91 -0
  50. .triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir +68 -0
.cache/pip/http-v2/0/1/f/2/0/01f2082df50502ba9492d64e69db99d1fdb5730707a16c6264b355b8.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d4511c52caacf3c4981d1ae2df85908bd31853f33d30b345c8b6830763f769c
+ size 1080866
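
Each of these pip-cache entries is checked in as a Git LFS pointer rather than the blob itself: a three-line stub recording the spec version, the SHA-256 of the real object, and its size in bytes. A minimal Python sketch of reading such a pointer back, assuming only the three-line format shown in these diffs (the function name and inline example are illustrative, not part of this repository):

```python
# Parse a Git LFS pointer file of the "version / oid / size" form shown
# in the diffs above. Hypothetical helper, not part of this repo.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    algo, _, digest = fields["oid"].partition(":")
    return {"algo": algo, "digest": digest, "size": int(fields["size"])}

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:5d4511c52caacf3c4981d1ae2df85908bd31853f33d30b345c8b6830763f769c\n"
    "size 1080866\n"
)
print(parse_lfs_pointer(pointer)["size"])  # 1080866
```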
.cache/pip/http-v2/0/2/d/4/2/02d4221e858694abc22129c65515f1df2c4c326330eb1a34ceb0b382.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bfae9500ad8e7d2937ebccb4906f3bc464d1bf66eedd0e4adabd520811c7b52
+ size 2631958
.cache/pip/http-v2/4/0/2/3/b/4023be7b5b37a7a4144c804ce69828082d4fb2a124d9d8aabc855da8.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd4c97d69242efd604c1a2077c8b56341e236cfaca78c40f59dcef9b95464fdc
+ size 9663908
.cache/pip/http-v2/4/f/d/2/5/4fd254dbd56deb4021e55d22c4b489f6c776c69c316eb7345bc91691.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f99e4769b4750076cd4235c044b61232110733322384a94a63791d2e7beacc66
+ size 9995162
.cache/pip/http-v2/9/6/e/8/3/96e83221dd149da9a3d38feebc955beb2034effd910108971c5b167b.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69
+ size 41178528
.cache/pip/http-v2/9/e/8/c/8/9e8c8c0496d6d3384d616902379ed05e07b6b1dba9673d70b5fef231.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d2665c5df629eb2f981dab244c01bfa6cdc185f4ffa026639286c4d56fafb54
+ size 1221827
.cache/pip/http-v2/a/e/7/a/2/ae7a241673cf118ca18eca030dc29d2715b1980127dd0e2949514433.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9dca7c3956b03b7663fac4d150f5e6d4f6f38b2462c1e9afd83bcf7019f17913
+ size 1080679
.cache/pip/http-v2/d/3/3/a/b/d33abf9ad709d023fff05902f39da682c1afb233bcd9f2c479487586.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc
+ size 2110226
.cache/pip/http-v2/d/b/1/f/6/db1f6b45c0850c8e2ce7d8b47148edeca6e8115413af41f4ecc8ce32.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047
+ size 39855626
.cache/pip/http-v2/f/5/2/7/6/f52769e4b4d00542e1e056baf2db3e5ad8f277bff67f2636cace711d.body ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57
+ size 13064210
.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826/flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8991eedb5038a1ee6fc9904f99c12b40213d66753ed91e261a43d085f5aeab8f
+ size 187219571
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/a/e/7/a/2/ae7a241673cf118ca18eca030dc29d2715b1980127dd0e2949514433.body filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/0/2/d/4/2/02d4221e858694abc22129c65515f1df2c4c326330eb1a34ceb0b382.body filter=lfs diff=lfs merge=lfs -text
+ .local/share/jupyter/nbextensions/nbTranslate/demo2.gif filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/4/f/d/2/5/4fd254dbd56deb4021e55d22c4b489f6c776c69c316eb7345bc91691.body filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/0/1/f/2/0/01f2082df50502ba9492d64e69db99d1fdb5730707a16c6264b355b8.body filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/4/0/2/3/b/4023be7b5b37a7a4144c804ce69828082d4fb2a124d9d8aabc855da8.body filter=lfs diff=lfs merge=lfs -text
+ .local/share/jupyter/nbextensions/scratchpad/demo.gif filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/f/5/2/7/6/f52769e4b4d00542e1e056baf2db3e5ad8f277bff67f2636cace711d.body filter=lfs diff=lfs merge=lfs -text
+ .local/share/jupyter/nbextensions/go_to_current_running_cell/auto_focus.gif filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/9/e/8/c/8/9e8c8c0496d6d3384d616902379ed05e07b6b1dba9673d70b5fef231.body filter=lfs diff=lfs merge=lfs -text
+ .local/share/jupyter/nbextensions/nbTranslate/demo1.gif filter=lfs diff=lfs merge=lfs -text
+ .local/share/jupyter/nbextensions/toc2/demo.gif filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/d/3/3/a/b/d33abf9ad709d023fff05902f39da682c1afb233bcd9f2c479487586.body filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240927_074011-gzu8f7wl/run-gzu8f7wl.wandb filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/9/6/e/8/3/96e83221dd149da9a3d38feebc955beb2034effd910108971c5b167b.body filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240927_005424-60260ulk/run-60260ulk.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240926_180814-1klxtkie/run-1klxtkie.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240926_111257-7yvor5gh/run-7yvor5gh.wandb filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/http-v2/d/b/1/f/6/db1f6b45c0850c8e2ce7d8b47148edeca6e8115413af41f4ecc8ce32.body filter=lfs diff=lfs merge=lfs -text
+ .cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826/flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
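
Every rule added here has the `<pattern> filter=lfs diff=lfs merge=lfs -text` shape that `git lfs track` writes into `.gitattributes`. A minimal sketch of recovering the tracked patterns from such a file (the function name and sample string are illustrative):

```python
# List the path patterns a .gitattributes file routes through Git LFS,
# i.e. lines of the form shown in the hunk above. Illustrative helper.
def lfs_tracked_patterns(gitattributes_text: str) -> list[str]:
    patterns = []
    for line in gitattributes_text.splitlines():
        parts = line.split()
        if len(parts) >= 2 and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

sample = "*.zip filter=lfs diff=lfs merge=lfs -text\n*.py diff=python"
print(lfs_tracked_patterns(sample))  # ['*.zip']
```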
.launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 ADDED
The diff for this file is too large to render. See raw diff
 
.local/share/jupyter/nbextensions/go_to_current_running_cell/auto_focus.gif ADDED

Git LFS Details

  • SHA256: 3dc033a545fe3eccdeee6e66932f1a46de4d0cafe084d471165e750ede1dcc4f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.75 MB
.local/share/jupyter/nbextensions/nbTranslate/demo1.gif ADDED

Git LFS Details

  • SHA256: d5a8c90d8375a3a4fb62dee376de91281b96c7616d3763eeaba9cc1d4f9c1f9c
  • Pointer size: 132 Bytes
  • Size of remote file: 2.34 MB
.local/share/jupyter/nbextensions/nbTranslate/demo2.gif ADDED

Git LFS Details

  • SHA256: c27adeaf2ba905566502c3f319f1d01e59a3683c77392d890c24a4532396884c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.1 MB
.local/share/jupyter/nbextensions/scratchpad/demo.gif ADDED

Git LFS Details

  • SHA256: cfbc6359d32c4b072feea49dca4880a75c6f49dd54c6d14225658f73e0d3ae27
  • Pointer size: 132 Bytes
  • Size of remote file: 1.16 MB
.local/share/jupyter/nbextensions/toc2/demo.gif ADDED

Git LFS Details

  • SHA256: 19189620710630c6073bc9fb06464e6b3b7faa85d09ea52fa70d803d9cfba587
  • Pointer size: 132 Bytes
  • Size of remote file: 3.49 MB
.local/share/jupyter/nbextensions/zenmode/images/back11.jpg ADDED
.local/share/jupyter/nbextensions/zenmode/images/back21.jpg ADDED
.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png ADDED
.triton/dump/11759acf26ac56366b171628132485d6/triton_.llir ADDED
@@ -0,0 +1,347 @@
+ ; ModuleID = 'LLVMDialectModule'
+ source_filename = "LLVMDialectModule"
+
+ @global_smem = external addrspace(3) global [0 x i8]
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+ define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 {
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+ %12 = and i32 %11, 31, !dbg !10
+ %13 = lshr i32 %11, 5, !dbg !10
+ %14 = and i32 %13, 1, !dbg !10
+ %urem = and i32 %11, 63, !dbg !10
+ %15 = shl nuw nsw i32 %urem, 2, !dbg !10
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+ %17 = shl i32 %16, 8, !dbg !12
+ %18 = or i32 %17, %15, !dbg !13
+ %19 = sext i32 %18 to i64, !dbg !14
+ %20 = getelementptr float, ptr addrspace(1) %1, i64 %19, !dbg !14
+ %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+ %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15
+ %23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15
+ %24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15
+ %25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15
+ %26 = bitcast i32 %24 to float, !dbg !15
+ %27 = bitcast i32 %25 to float, !dbg !15
+ %28 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !16
+ %29 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %28, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+ %30 = extractvalue { i32, i32 } %29, 0, !dbg !17
+ %31 = extractvalue { i32, i32 } %29, 1, !dbg !17
+ %32 = trunc i32 %30 to i16, !dbg !17
+ %extelt.offset = lshr i32 %30, 16, !dbg !17
+ %33 = trunc i32 %extelt.offset to i16, !dbg !17
+ %34 = trunc i32 %31 to i16, !dbg !17
+ %extelt.offset1 = lshr i32 %31, 16, !dbg !17
+ %35 = trunc i32 %extelt.offset1 to i16, !dbg !17
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
+ %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
+ %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18
+ %40 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !19
+ %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+ %42 = extractvalue { i32, i32 } %41, 0, !dbg !20
+ %43 = extractvalue { i32, i32 } %41, 1, !dbg !20
+ %44 = trunc i32 %42 to i16, !dbg !20
+ %extelt.offset2 = lshr i32 %42, 16, !dbg !20
+ %45 = trunc i32 %extelt.offset2 to i16, !dbg !20
+ %46 = trunc i32 %43 to i16, !dbg !20
+ %extelt.offset3 = lshr i32 %43, 16, !dbg !20
+ %47 = trunc i32 %extelt.offset3 to i16, !dbg !20
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
+ %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21
+ %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21
+ %52 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !22
+ %53 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+ %54 = extractvalue { i32, i32 } %53, 0, !dbg !23
+ %55 = extractvalue { i32, i32 } %53, 1, !dbg !23
+ %56 = trunc i32 %54 to i16, !dbg !23
+ %extelt.offset4 = lshr i32 %54, 16, !dbg !23
+ %57 = trunc i32 %extelt.offset4 to i16, !dbg !23
+ %58 = trunc i32 %55 to i16, !dbg !23
+ %extelt.offset5 = lshr i32 %55, 16, !dbg !23
+ %59 = trunc i32 %extelt.offset5 to i16, !dbg !23
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24
+ %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24
+ %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24
+ %64 = zext nneg i32 %15 to i64, !dbg !25
+ %65 = getelementptr float, ptr addrspace(1) %5, i64 %64, !dbg !25
+ %66 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
+ %67 = fadd float %38, %26, !dbg !27
+ %68 = fadd float %39, %27, !dbg !27
+ %69 = fadd float %67, %50, !dbg !28
+ %70 = fadd float %68, %51, !dbg !28
+ %71 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !15
+ %72 = insertelement <2 x i32> %71, i32 %23, i64 1, !dbg !15
+ %73 = bitcast <2 x i32> %72 to <2 x float>, !dbg !15
+ %74 = insertelement <2 x float> poison, float %36, i64 0, !dbg !27
+ %75 = insertelement <2 x float> %74, float %37, i64 1, !dbg !27
+ %76 = fadd <2 x float> %75, %73, !dbg !27
+ %77 = insertelement <2 x float> poison, float %48, i64 0, !dbg !28
+ %78 = insertelement <2 x float> %77, float %49, i64 1, !dbg !28
+ %79 = fadd <2 x float> %76, %78, !dbg !28
+ %80 = insertelement <2 x float> poison, float %60, i64 0, !dbg !29
+ %81 = insertelement <2 x float> %80, float %61, i64 1, !dbg !29
+ %82 = fadd <2 x float> %79, %81, !dbg !29
+ %83 = fadd float %69, %62, !dbg !29
+ %84 = fadd float %70, %63, !dbg !29
+ %85 = extractelement <2 x float> %82, i64 0, !dbg !30
+ %86 = extractelement <2 x float> %82, i64 1, !dbg !30
+ %87 = fadd float %85, %86, !dbg !30
+ %88 = fadd float %87, %83, !dbg !30
+ %89 = fadd float %88, %84, !dbg !30
+ %90 = bitcast float %89 to i32, !dbg !36
+ %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 16, i32 31), !dbg !36
+ %92 = bitcast i32 %91 to float, !dbg !36
+ %93 = fadd float %89, %92, !dbg !30
+ %94 = bitcast float %93 to i32, !dbg !36
+ %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 8, i32 31), !dbg !36
+ %96 = bitcast i32 %95 to float, !dbg !36
+ %97 = fadd float %93, %96, !dbg !30
+ %98 = bitcast float %97 to i32, !dbg !36
+ %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 4, i32 31), !dbg !36
+ %100 = bitcast i32 %99 to float, !dbg !36
+ %101 = fadd float %97, %100, !dbg !30
+ %102 = bitcast float %101 to i32, !dbg !36
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !36
+ %104 = bitcast i32 %103 to float, !dbg !36
+ %105 = fadd float %101, %104, !dbg !30
+ %106 = bitcast float %105 to i32, !dbg !36
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !36
+ %108 = bitcast i32 %107 to float, !dbg !36
+ %109 = fadd float %105, %108, !dbg !30
+ %110 = icmp eq i32 %12, 0, !dbg !36
+ %111 = zext nneg i32 %14 to i64, !dbg !36
+ %112 = getelementptr float, ptr addrspace(3) @global_smem, i64 %111, !dbg !36
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, float %109, i1 %110) #6, !dbg !36
+ tail call void @llvm.nvvm.barrier0(), !dbg !36
+ %113 = icmp slt i32 %11, 2, !dbg !36
+ %114 = sext i32 %11 to i64, !dbg !36
+ %115 = getelementptr float, ptr addrspace(3) @global_smem, i64 %114, !dbg !36
+ %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %113) #6, !dbg !36
+ %117 = bitcast float %116 to i32, !dbg !36
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !36
+ %119 = bitcast i32 %118 to float, !dbg !36
+ %120 = fadd float %116, %119, !dbg !30
+ %121 = and i32 %11, 1, !dbg !36
+ %122 = icmp eq i32 %121, 0, !dbg !36
+ %123 = and i1 %113, %122, !dbg !36
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %120, i1 %123) #6, !dbg !36
+ tail call void @llvm.nvvm.barrier0(), !dbg !36
+ %124 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36
+ %125 = fadd float %124, 0.000000e+00, !dbg !38
+ %126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %125, float 2.560000e+02) #6, !dbg !42
+ %127 = fsub float %85, %126, !dbg !43
+ %128 = fsub float %86, %126, !dbg !43
+ %129 = fsub float %83, %126, !dbg !43
+ %130 = fsub float %84, %126, !dbg !43
+ %131 = fmul float %127, %127, !dbg !44
+ %132 = fmul float %128, %128, !dbg !44
+ %133 = fmul float %129, %129, !dbg !44
+ %134 = fmul float %130, %130, !dbg !44
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
+ %135 = fadd float %131, %132, !dbg !47
+ %136 = fadd float %133, %135, !dbg !47
+ %137 = fadd float %134, %136, !dbg !47
+ %138 = bitcast float %137 to i32, !dbg !45
+ %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !45
+ %140 = bitcast i32 %139 to float, !dbg !45
+ %141 = fadd float %137, %140, !dbg !47
+ %142 = bitcast float %141 to i32, !dbg !45
+ %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !45
+ %144 = bitcast i32 %143 to float, !dbg !45
+ %145 = fadd float %141, %144, !dbg !47
+ %146 = bitcast float %145 to i32, !dbg !45
+ %147 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %146, i32 4, i32 31), !dbg !45
+ %148 = bitcast i32 %147 to float, !dbg !45
+ %149 = fadd float %145, %148, !dbg !47
+ %150 = bitcast float %149 to i32, !dbg !45
+ %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 2, i32 31), !dbg !45
+ %152 = bitcast i32 %151 to float, !dbg !45
+ %153 = fadd float %149, %152, !dbg !47
+ %154 = bitcast float %153 to i32, !dbg !45
+ %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !45
+ %156 = bitcast i32 %155 to float, !dbg !45
+ %157 = fadd float %153, %156, !dbg !47
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, float %157, i1 %110) #6, !dbg !45
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
+ %158 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %113) #6, !dbg !45
+ %159 = bitcast float %158 to i32, !dbg !45
+ %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 1, i32 31), !dbg !45
+ %161 = bitcast i32 %160 to float, !dbg !45
+ %162 = fadd float %158, %161, !dbg !47
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %162, i1 %123) #6, !dbg !45
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
+ %163 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
+ %164 = fadd float %163, 0.000000e+00, !dbg !50
+ %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %164, float 2.560000e+02) #6, !dbg !52
+ %166 = fadd float %165, 0x3EE4F8B580000000, !dbg !53
+ %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54
+ %.not.i = icmp eq i32 %167, 0, !dbg !54
+ br i1 %.not.i, label %170, label %168, !dbg !54
+
+ 168: ; preds = %10
+ %169 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %166), !dbg !54
+ br label %__nv_rsqrtf.exit, !dbg !54
+
+ 170: ; preds = %10
+ %171 = tail call float @llvm.nvvm.rsqrt.approx.f(float %166), !dbg !54
+ br label %__nv_rsqrtf.exit, !dbg !54
+
+ __nv_rsqrtf.exit: ; preds = %168, %170
+ %.0.i = phi float [ %169, %168 ], [ %171, %170 ], !dbg !54
+ %172 = extractvalue { i32, i32, i32, i32 } %66, 3, !dbg !26
+ %173 = bitcast i32 %172 to float, !dbg !26
+ %174 = extractvalue { i32, i32, i32, i32 } %66, 2, !dbg !26
+ %175 = bitcast i32 %174 to float, !dbg !26
+ %176 = extractvalue { i32, i32, i32, i32 } %66, 1, !dbg !26
+ %177 = bitcast i32 %176 to float, !dbg !26
+ %178 = extractvalue { i32, i32, i32, i32 } %66, 0, !dbg !26
+ %179 = bitcast i32 %178 to float, !dbg !26
+ %180 = fmul float %127, %.0.i, !dbg !55
+ %181 = fmul float %128, %.0.i, !dbg !55
+ %182 = fmul float %129, %.0.i, !dbg !55
+ %183 = fmul float %130, %.0.i, !dbg !55
+ %184 = fmul float %180, %179, !dbg !56
+ %185 = fmul float %181, %177, !dbg !56
+ %186 = fmul float %182, %175, !dbg !56
+ %187 = fmul float %183, %173, !dbg !56
+ tail call void @llvm.nvvm.barrier0(), !dbg !57
+ %188 = sext i32 %16 to i64, !dbg !58
+ %189 = getelementptr float, ptr addrspace(1) %0, i64 %188, !dbg !58
+ %190 = icmp eq i32 %urem, 0, !dbg !59
+ %191 = bitcast float %.0.i to i32, !dbg !59
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %191, ptr addrspace(1) %189, i1 %190) #6, !dbg !59
+ %192 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !60
+ %193 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !61
+ %194 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !61
+ %195 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %186) #6, !dbg !61
+ %196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %187) #6, !dbg !61
+ %197 = insertelement <2 x i16> undef, i16 %193, i64 0, !dbg !61
+ %198 = insertelement <2 x i16> %197, i16 %194, i64 1, !dbg !61
+ %199 = bitcast <2 x i16> %198 to i32, !dbg !61
+ %200 = insertelement <2 x i16> undef, i16 %195, i64 0, !dbg !61
+ %201 = insertelement <2 x i16> %200, i16 %196, i64 1, !dbg !61
+ %202 = bitcast <2 x i16> %201 to i32, !dbg !61
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %199, i32 %202, ptr addrspace(1) %192, i1 true) #6, !dbg !61
+ %203 = getelementptr float, ptr addrspace(1) %6, i64 %188, !dbg !62
+ %204 = bitcast float %126 to i32, !dbg !63
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %204, ptr addrspace(1) %203, i1 %190) #6, !dbg !63
+ ret void, !dbg !64
+ }
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+ ; Function Attrs: convergent nocallback nounwind
+ declare void @llvm.nvvm.barrier0() #2
+
+ ; Function Attrs: alwaysinline nounwind
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+ %.not = icmp eq i32 %1, 0
+ br i1 %.not, label %4, label %2
+
+ 2: ; preds = %0
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+ br label %6
+
+ 4: ; preds = %0
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+ br label %6
+
+ 6: ; preds = %4, %2
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+ ret float %.0
+ }
+
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+ attributes #2 = { convergent nocallback nounwind }
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+ attributes #6 = { nounwind }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.dbg.cu = !{!2}
+ !nvvm.annotations = !{!4, !5, !5, !4}
+ !llvm.ident = !{!6}
+
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+ !3 = !DIFile(filename: "cdxa5yqgsimvskocpuiz4ajfrjfcwys3opyrdv53xfphj4576qx7.py", directory: "/tmp/torchinductor_root/dx")
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
+ !5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+ !9 = !{}
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
+ !22 = !DILocation(line: 33, column: 30, scope: !7)
+ !23 = !DILocation(line: 33, column: 46, scope: !7)
+ !24 = !DILocation(line: 33, column: 67, scope: !7)
+ !25 = !DILocation(line: 34, column: 31, scope: !7)
+ !26 = !DILocation(line: 34, column: 36, scope: !7)
+ !27 = !DILocation(line: 36, column: 18, scope: !7)
+ !28 = !DILocation(line: 38, column: 18, scope: !7)
+ !29 = !DILocation(line: 40, column: 18, scope: !7)
+ !30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34)
+ !31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0)
+ !32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
+ !34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
+ !35 = !DILocation(line: 45, column: 59, scope: !31)
+ !36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37)
+ !37 = !DILocation(line: 45, column: 59, scope: !33)
+ !38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41)
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+ !41 = !DILocation(line: 45, column: 45, scope: !39)
+ !42 = !DILocation(line: 48, column: 20, scope: !7)
+ !43 = !DILocation(line: 49, column: 20, scope: !7)
+ !44 = !DILocation(line: 50, column: 20, scope: !7)
+ !45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46)
+ !46 = !DILocation(line: 53, column: 59, scope: !33)
+ !47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48)
+ !48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49)
+ !49 = !DILocation(line: 53, column: 59, scope: !31)
+ !50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51)
+ !51 = !DILocation(line: 53, column: 45, scope: !39)
+ !52 = !DILocation(line: 55, column: 20, scope: !7)
+ !53 = !DILocation(line: 57, column: 20, scope: !7)
+ !54 = !DILocation(line: 58, column: 26, scope: !7)
+ !55 = !DILocation(line: 60, column: 20, scope: !7)
+ !56 = !DILocation(line: 61, column: 20, scope: !7)
+ !57 = !DILocation(line: 63, column: 4, scope: !7)
+ !58 = !DILocation(line: 64, column: 28, scope: !7)
+ !59 = !DILocation(line: 64, column: 40, scope: !7)
+ !60 = !DILocation(line: 65, column: 25, scope: !7)
+ !61 = !DILocation(line: 65, column: 48, scope: !7)
+ !62 = !DILocation(line: 66, column: 25, scope: !7)
+ !63 = !DILocation(line: 66, column: 37, scope: !7)
+ !64 = !DILocation(line: 66, column: 4, scope: !7)
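
The `@llvm.nvvm.shfl.sync.bfly.i32` calls with offsets 16, 8, 4, 2, 1 (followed by a short pass through `@global_smem` to combine the two warps) are a standard warp-level butterfly sum reduction: each step folds the 32 lanes' partial sums in half until every lane holds the total. A minimal NumPy sketch of that dataflow, assuming a 32-lane warp (names are illustrative):

```python
import numpy as np

# Simulate the butterfly (shfl.sync.bfly) sum reduction in the IR above:
# at each offset, lane i adds the value held by lane i XOR offset.
def warp_butterfly_sum(lane_values: np.ndarray) -> np.ndarray:
    vals = lane_values.astype(np.float32)
    for offset in (16, 8, 4, 2, 1):
        partner = np.arange(32) ^ offset  # lane exchanged with via shuffle
        vals = vals + vals[partner]       # all lanes update simultaneously
    return vals  # every lane now holds the warp-wide sum

x = np.arange(32, dtype=np.float32)
print(warp_butterfly_sum(x)[0], x.sum())  # 496.0 496.0
```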
.triton/dump/11759acf26ac56366b171628132485d6/triton_.ttgir ADDED
@@ -0,0 +1,78 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+ %cst_0 = arith.constant 9.99999974E-6 : f32
+ %cst_1 = arith.constant 2.560000e+02 : f32
+ %cst_2 = arith.constant 0.000000e+00 : f32
+ %c256_i32 = arith.constant 256 : i32
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+ %0 = tt.get_program_id x : i32
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+ %3 = arith.muli %0, %c256_i32 : i32
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+ %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+ %21 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+ %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+ %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+ %24 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+ %25 = arith.addf %24, %16 : tensor<256xf32, #blocked>
+ %26 = arith.addf %25, %20 : tensor<256xf32, #blocked>
+ %27 = arith.select %2, %26, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
+ ^bb0(%arg10: f32, %arg11: f32):
+ %52 = arith.addf %arg10, %arg11 : f32
+ tt.reduce.return %52 : f32
+ }) : (tensor<256xf32, #blocked>) -> f32
+ %29 = arith.addf %28, %cst_2 : f32
+ %30 = arith.divf %29, %cst_1 : f32
+ %31 = tt.splat %30 : (f32) -> tensor<1xf32, #blocked1>
+ %32 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
+ %33 = arith.subf %26, %32 : tensor<256xf32, #blocked>
+ %34 = arith.mulf %33, %33 : tensor<256xf32, #blocked>
+ %35 = arith.select %2, %34, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+ %36 = "tt.reduce"(%35) <{axis = 0 : i32}> ({
+ ^bb0(%arg10: f32, %arg11: f32):
+ %52 = arith.addf %arg10, %arg11 : f32
+ tt.reduce.return %52 : f32
+ }) : (tensor<256xf32, #blocked>) -> f32
+ %37 = arith.addf %36, %cst_2 : f32
+ %38 = arith.divf %37, %cst_1 : f32
+ %39 = arith.addf %38, %cst_0 : f32
+ %40 = tt.extern_elementwise %39 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+ %41 = tt.splat %40 : (f32) -> tensor<1xf32, #blocked1>
+ %42 = tt.splat %40 : (f32) -> tensor<256xf32, #blocked>
+ %43 = arith.mulf %33, %42 : tensor<256xf32, #blocked>
+ %44 = arith.mulf %43, %23 : tensor<256xf32, #blocked>
+ gpu.barrier
+ %45 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+ %46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+ tt.store %46, %41 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+ %47 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+ %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+ %49 = arith.truncf %44 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+ tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+ %50 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
+ %51 = tt.splat %50 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+ tt.store %51, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+ tt.return
+ }
+ }
.triton/dump/11759acf26ac56366b171628132485d6/triton_.ttir ADDED
@@ -0,0 +1,76 @@
+ module {
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %c256_i32 = arith.constant 256 : i32
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant 2.560000e+02 : f32
+ %cst_2 = arith.constant 9.99999974E-6 : f32
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
+ %0 = tt.get_program_id x : i32
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+ %3 = arith.muli %0, %c256_i32 : i32
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+ %5 = arith.addi %1, %4 : tensor<256xi32>
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+ %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+ %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+ %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
+ %21 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+ %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+ %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+ %24 = arith.addf %8, %12 : tensor<256xf32>
+ %25 = arith.addf %24, %16 : tensor<256xf32>
+ %26 = arith.addf %25, %20 : tensor<256xf32>
+ %27 = arith.select %2, %26, %cst_3 : tensor<256xi1>, tensor<256xf32>
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
+ ^bb0(%arg10: f32, %arg11: f32):
+ %52 = arith.addf %arg10, %arg11 : f32
+ tt.reduce.return %52 : f32
+ }) : (tensor<256xf32>) -> f32
+ %29 = arith.addf %28, %cst_0 : f32
+ %30 = arith.divf %29, %cst_1 : f32
+ %31 = tt.splat %30 : (f32) -> tensor<1xf32>
+ %32 = tt.splat %30 : (f32) -> tensor<256xf32>
+ %33 = arith.subf %26, %32 : tensor<256xf32>
+ %34 = arith.mulf %33, %33 : tensor<256xf32>
+ %35 = arith.select %2, %34, %cst_3 : tensor<256xi1>, tensor<256xf32>
+ %36 = "tt.reduce"(%35) <{axis = 0 : i32}> ({
+ ^bb0(%arg10: f32, %arg11: f32):
+ %52 = arith.addf %arg10, %arg11 : f32
+ tt.reduce.return %52 : f32
+ }) : (tensor<256xf32>) -> f32
+ %37 = arith.addf %36, %cst_0 : f32
+ %38 = arith.divf %37, %cst_1 : f32
+ %39 = arith.addf %38, %cst_2 : f32
+ %40 = tt.extern_elementwise %39 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+ %41 = tt.splat %40 : (f32) -> tensor<1xf32>
+ %42 = tt.splat %40 : (f32) -> tensor<256xf32>
+ %43 = arith.mulf %33, %42 : tensor<256xf32>
+ %44 = arith.mulf %43, %23 : tensor<256xf32>
+ gpu.barrier
+ %45 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+ %46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+ tt.store %46, %41 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+ %47 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+ %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+ %49 = arith.truncf %44 : tensor<256xf32> to tensor<256xbf16>
+ tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+ %50 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
+ %51 = tt.splat %50 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+ tt.store %51, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+ tt.return
+ }
+ }
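
Read end to end, this TTIR is a fused residual-add plus LayerNorm over one 256-element row per program: three bf16 tensors and one f32 tensor are summed, the mean and biased variance are reduced, and the centered row is scaled by rsqrt(var + eps) and a weight vector, with the bf16 result, the per-row mean, and the rstd written out. A minimal NumPy sketch of that math, with argument roles inferred from the loads and stores above (the names are illustrative, not the kernel's API):

```python
import numpy as np

# Rough NumPy model of the per-row computation in the TTIR above:
# residual add -> mean/variance reduction -> rsqrt normalization -> scale.
# The eps constant 9.99999974e-6 and row length 256 come from the IR;
# the argument names are inferred, not documented.
def fused_add_layernorm(residual, x1, x2, x3, weight, eps=9.99999974e-6):
    s = residual + x1 + x2 + x3                  # %24..%26
    mean = s.sum() / 256.0                       # first tt.reduce, then divf
    centered = s - mean                          # %33
    var = (centered * centered).sum() / 256.0    # second tt.reduce, then divf
    rstd = 1.0 / np.sqrt(var + eps)              # __nv_rsqrtf
    return centered * rstd * weight, mean, rstd  # %44 plus the two scalars

rng = np.random.default_rng(0)
rows = [rng.standard_normal(256).astype(np.float32) for _ in range(5)]
out, mean, rstd = fused_add_layernorm(*rows)
print(out.shape, float(mean), float(rstd))
```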
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx ADDED
@@ -0,0 +1,988 @@
+ //
+ // Generated by LLVM NVPTX Back-End
+ //
+
+ .version 8.2
+ .target sm_89
+ .address_size 64
+
+ // .globl triton__0d1d2d3d4d5de6de
+ .extern .func __assertfail
+ (
+ .param .b64 __assertfail_param_0,
+ .param .b64 __assertfail_param_1,
+ .param .b32 __assertfail_param_2,
+ .param .b64 __assertfail_param_3,
+ .param .b64 __assertfail_param_4
+ )
+ ;
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+ .extern .shared .align 1 .b8 global_smem[];
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+ .visible .entry triton__0d1d2d3d4d5de6de(
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
+ )
+ .maxntid 128, 1, 1
+ {
+ .reg .pred %p<50>;
+ .reg .b16 %rs<5>;
+ .reg .b32 %r<169>;
+ .reg .f32 %f<153>;
+ .reg .b64 %rd<53>;
+ .loc 1 18 0
+ $L__func_begin0:
+ .loc 1 18 0
+
+ ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_3];
+ ld.param.u64 %rd5, [triton__0d1d2d3d4d5de6de_param_1];
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5de6de_param_0];
+ $L__tmp0:
+ .loc 1 22 44
+ mov.u32 %r1, %tid.x;
+ and.b32 %r2, %r1, 31;
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5de6de_param_2];
+ bfe.u32 %r3, %r1, 6, 1;
+ and.b32 %r4, %r1, 1;
+ .loc 1 24 33
+ bfe.u32 %r5, %r1, 5, 1;
+ shl.b32 %r24, %r1, 2;
+ and.b32 %r6, %r24, 252;
+ shl.b32 %r25, %r1, 1;
+ and.b32 %r7, %r25, 254;
+ .loc 1 21 28
+ mov.u32 %r15, %ctaid.x;
+ .loc 1 21 33
+ shl.b32 %r26, %r15, 1;
+ .loc 1 22 23
+ or.b32 %r8, %r26, %r3;
+ or.b32 %r27, %r26, %r4;
+ .loc 1 26 30
+ mul.wide.s32 %rd21, %r8, 8;
+ add.s64 %rd9, %rd19, %rd21;
+ mul.wide.s32 %rd22, %r27, 8;
+ add.s64 %rd17, %rd19, %rd22;
+ mov.pred %p44, -1;
+ .loc 1 26 35
+ mov.u64 %rd8, 0x0;
+ @%p44 ld.global.L1::evict_last.b64 { %rd8 }, [ %rd9 + 0 ];
+ mov.u64 %rd10, 0x0;
+ @%p44 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd9 + 0 ];
+ mov.u64 %rd12, 0x0;
+ @%p44 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd9 + 0 ];
+ mov.u64 %rd14, 0x0;
+ @%p44 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd9 + 0 ];
+ mov.u64 %rd16, 0x0;
+ @%p44 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ];
+ .loc 1 27 18
+ bfe.s32 %r28, %r15, 30, 1;
+ shr.u32 %r29, %r28, 23;
+ add.s32 %r30, %r8, %r29;
+ and.b32 %r31, %r30, 16776704;
+ sub.s32 %r32, %r8, %r31;
+ .loc 1 35 44
+ shl.b32 %r33, %r32, 8;
+ .loc 1 35 40
+ or.b32 %r34, %r33, %r6;
+ .loc 1 35 34
+ mul.wide.s32 %rd23, %r34, 4;
+ add.s64 %rd33, %rd20, %rd23;
+ mov.b32 %r137, 0;
+ .loc 1 35 50
+ mov.u32 %r16, 0x0;
+ mov.u32 %r17, 0x0;
+ mov.u32 %r18, 0x0;
+ mov.u32 %r19, 0x0;
+ @%p44 ld.global.L1::evict_last.v4.b32 { %r16, %r17, %r18, %r19 }, [ %rd33 + 0 ];
+ @!%p44 mov.u32 %r16, %r137;
+ @!%p44 mov.u32 %r17, %r137;
+ @!%p44 mov.u32 %r18, %r137;
+ @!%p44 mov.u32 %r19, %r137;
+ mov.b32 %f1, %r16;
+ mov.b32 %f2, %r17;
+ mov.b32 %f3, %r18;
+ mov.b32 %f4, %r19;
+ .loc 1 36 22
+ add.s64 %rd24, %rd16, 50257;
+ .loc 1 37 22
+ setp.lt.s64 %p11, %rd16, 0;
+ .loc 1 38 36
+ selp.b64 %rd3, %rd24, %rd16, %p11;
+ .loc 1 39 40
+ setp.lt.u64 %p12, %rd3, 50257;
+ mov.b32 %r168, 883;
+ mov.u64 %rd52, 1;
+ .loc 1 39 55
+ @%p12 bra $L__BB0_2;
+ mov.u64 %rd25, assertMessage_0;
+ cvta.global.u64 %rd26, %rd25;
+ mov.u64 %rd27, assertFile_0;
+ cvta.global.u64 %rd28, %rd27;
+ mov.u64 %rd29, assertFunc_0;
+ cvta.global.u64 %rd30, %rd29;
+ { // callseq 4, 0
+ .reg .b32 temp_param_reg;
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd26;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd28;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r168;
+ .param .b64 param3;
+ st.param.b64 [param3+0], %rd30;
+ .param .b64 param4;
+ st.param.b64 [param4+0], %rd52;
+ call.uni
+ __assertfail,
+ (
+ param0,
+ param1,
+ param2,
+ param3,
+ param4
+ );
+ } // callseq 4
+ $L__BB0_2:
+ .loc 1 0 55
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_4];
+ .loc 1 37 22
+ setp.lt.s64 %p36, %rd8, 0;
+ .loc 1 40 44
+ shl.b64 %rd35, %rd8, 8;
+ add.s64 %rd36, %rd35, 12865792;
+ selp.b64 %rd37, %rd36, %rd35, %p36;
+ cvt.u64.u32 %rd38, %r6;
+ .loc 1 40 40
+ or.b64 %rd39, %rd37, %rd38;
+ .loc 1 40 34
+ shl.b64 %rd40, %rd39, 2;
+ add.s64 %rd49, %rd5, %rd40;
+ .loc 1 40 52
+ mov.u32 %r36, 0x0;
+ mov.u32 %r37, 0x0;
+ mov.u32 %r38, 0x0;
+ mov.u32 %r39, 0x0;
+ @%p44 ld.global.L1::evict_last.v4.b32 { %r36, %r37, %r38, %r39 }, [ %rd49 + 0 ];
+ @!%p44 mov.u32 %r36, %r137;
+ @!%p44 mov.u32 %r37, %r137;
+ @!%p44 mov.u32 %r38, %r137;
+ @!%p44 mov.u32 %r39, %r137;
+ mov.b32 %f7, %r36;
+ mov.b32 %f8, %r37;
+ mov.b32 %f9, %r38;
+ mov.b32 %f10, %r39;
+ .loc 1 41 22
+ add.f32 %f11, %f1, %f7;
+ add.f32 %f12, %f2, %f8;
+ add.f32 %f13, %f3, %f9;
+ add.f32 %f14, %f4, %f10;
+ $L__tmp1:
+ .loc 2 98 22
+ add.f32 %f15, %f11, 0f00000000;
+ add.f32 %f16, %f12, 0f00000000;
+ add.f32 %f17, %f13, 0f00000000;
+ add.f32 %f18, %f14, 0f00000000;
+ .loc 2 101 30
+ sub.f32 %f19, %f11, %f15;
+ sub.f32 %f20, %f12, %f16;
+ sub.f32 %f21, %f13, %f17;
+ sub.f32 %f22, %f14, %f18;
+ .loc 2 101 13
+ fma.rn.f32 %f23, %f11, %f19, 0f00000000;
+ fma.rn.f32 %f24, %f12, %f20, 0f00000000;
+ fma.rn.f32 %f25, %f13, %f21, 0f00000000;
+ fma.rn.f32 %f26, %f14, %f22, 0f00000000;
+ $L__tmp2:
+ .loc 2 108 21
+ sub.f32 %f27, %f16, %f15;
+ mov.b32 %r45, 1065353216;
+ mov.b32 %r46, 1073741824;
+ .loc 2 110 60
+ div.full.f32 %r44, %r45, %r46;
+ mov.b32 %f28, %r44;
+ .loc 2 112 17
+ fma.rn.f32 %f29, %f28, %f27, %f15;
+ .loc 2 113 15
+ add.f32 %f30, %f23, %f24;
+ .loc 2 113 30
+ mul.f32 %f31, %f27, %f27;
+ .loc 2 113 22
+ fma.rn.f32 %f32, %f28, %f31, %f30;
+ .loc 2 108 21
+ sub.f32 %f33, %f17, %f29;
+ mov.b32 %r49, 1077936128;
+ .loc 2 110 60
+ div.full.f32 %r47, %r45, %r49;
+ mov.b32 %f34, %r47;
+ .loc 2 112 17
+ fma.rn.f32 %f35, %f34, %f33, %f29;
+ .loc 2 113 15
+ add.f32 %f36, %f25, %f32;
+ .loc 2 113 30
+ mul.f32 %f37, %f33, %f33;
+ .loc 2 113 38
+ fma.rn.f32 %f38, %f33, %f33, %f37;
+ .loc 2 113 22
+ fma.rn.f32 %f39, %f34, %f38, %f36;
+ .loc 2 108 21
+ sub.f32 %f40, %f18, %f35;
+ mov.b32 %r52, 1082130432;
+ .loc 2 110 60
+ div.full.f32 %r50, %r45, %r52;
+ mov.b32 %f41, %r50;
+ .loc 2 112 17
+ fma.rn.f32 %f42, %f41, %f40, %f35;
+ .loc 2 113 15
+ add.f32 %f43, %f26, %f39;
+ .loc 2 113 30
+ mul.f32 %f44, %f40, %f40;
+ .loc 2 113 38
+ mul.f32 %f45, %f44, 0f40400000;
+ .loc 2 113 22
+ fma.rn.f32 %f46, %f41, %f45, %f43;
+ $L__tmp3:
+ .loc 2 120 46
+ mov.b32 %r101, %f42;
+ shfl.sync.bfly.b32 %r102, %r101, 16, 31, -1;
+ mov.b32 %f47, %r102;
+ mov.b32 %r103, %f46;
+ shfl.sync.bfly.b32 %r104, %r103, 16, 31, -1;
+ mov.b32 %f48, %r104;
+ shfl.sync.bfly.b32 %r54, %r52, 16, 31, -1;
+ mov.b32 %f49, %r54;
+ $L__tmp4:
+ .loc 2 108 21
+ sub.f32 %f50, %f47, %f42;
+ .loc 2 109 28
+ add.f32 %f51, %f49, 0f40800000;
+ .loc 2 110 39
+ setp.eq.f32 %p37, %f51, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r55, %f51;
+ div.full.f32 %r53, %r54, %r55;
+ mov.b32 %f52, %r53;
+ .loc 2 110 49
+ selp.f32 %f53, 0f00000000, %f52, %p37;
+ .loc 2 112 17
+ fma.rn.f32 %f54, %f53, %f50, %f42;
+ .loc 2 113 15
+ add.f32 %f55, %f46, %f48;
+ .loc 2 113 30
+ mul.f32 %f56, %f50, %f50;
+ .loc 2 113 38
+ mul.f32 %f57, %f56, 0f40800000;
+ .loc 2 113 22
+ fma.rn.f32 %f58, %f53, %f57, %f55;
+ $L__tmp5:
+ .loc 2 120 46
+ mov.b32 %r105, %f54;
+ shfl.sync.bfly.b32 %r106, %r105, 8, 31, -1;
+ mov.b32 %f59, %r106;
+ mov.b32 %r107, %f58;
+ shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1;
+ mov.b32 %f60, %r108;
+ shfl.sync.bfly.b32 %r57, %r55, 8, 31, -1;
+ mov.b32 %f61, %r57;
+ $L__tmp6:
+ .loc 2 108 21
+ sub.f32 %f62, %f59, %f54;
+ .loc 2 109 28
+ add.f32 %f63, %f51, %f61;
+ .loc 2 110 39
+ setp.eq.f32 %p38, %f63, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r58, %f63;
+ div.full.f32 %r56, %r57, %r58;
+ mov.b32 %f64, %r56;
+ .loc 2 110 49
+ selp.f32 %f65, 0f00000000, %f64, %p38;
+ .loc 2 112 17
+ fma.rn.f32 %f66, %f65, %f62, %f54;
+ .loc 2 113 15
+ add.f32 %f67, %f58, %f60;
+ .loc 2 113 30
+ mul.f32 %f68, %f62, %f62;
+ .loc 2 113 38
+ mul.f32 %f69, %f51, %f68;
+ .loc 2 113 22
+ fma.rn.f32 %f70, %f65, %f69, %f67;
+ $L__tmp7:
+ .loc 2 120 46
+ mov.b32 %r109, %f66;
+ shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1;
+ mov.b32 %f71, %r110;
+ mov.b32 %r111, %f70;
+ shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
+ mov.b32 %f72, %r112;
+ shfl.sync.bfly.b32 %r60, %r58, 4, 31, -1;
+ mov.b32 %f73, %r60;
+ $L__tmp8:
+ .loc 2 108 21
+ sub.f32 %f74, %f71, %f66;
+ .loc 2 109 28
+ add.f32 %f75, %f63, %f73;
+ .loc 2 110 39
+ setp.eq.f32 %p39, %f75, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r61, %f75;
+ div.full.f32 %r59, %r60, %r61;
+ mov.b32 %f76, %r59;
+ .loc 2 110 49
+ selp.f32 %f77, 0f00000000, %f76, %p39;
+ .loc 2 112 17
+ fma.rn.f32 %f78, %f77, %f74, %f66;
+ .loc 2 113 15
+ add.f32 %f79, %f70, %f72;
+ .loc 2 113 30
+ mul.f32 %f80, %f74, %f74;
+ .loc 2 113 38
+ mul.f32 %f81, %f63, %f80;
+ .loc 2 113 22
+ fma.rn.f32 %f82, %f77, %f81, %f79;
+ $L__tmp9:
+ .loc 2 120 46
+ mov.b32 %r113, %f78;
+ shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
+ mov.b32 %f83, %r114;
+ mov.b32 %r115, %f82;
+ shfl.sync.bfly.b32 %r116, %r115, 2, 31, -1;
+ mov.b32 %f84, %r116;
+ shfl.sync.bfly.b32 %r63, %r61, 2, 31, -1;
+ mov.b32 %f85, %r63;
+ $L__tmp10:
+ .loc 2 108 21
+ sub.f32 %f86, %f83, %f78;
+ .loc 2 109 28
+ add.f32 %f87, %f75, %f85;
+ .loc 2 110 39
+ setp.eq.f32 %p40, %f87, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r64, %f87;
+ div.full.f32 %r62, %r63, %r64;
+ mov.b32 %f88, %r62;
+ .loc 2 110 49
+ selp.f32 %f89, 0f00000000, %f88, %p40;
376
+ .loc 2 112 17
377
+ fma.rn.f32 %f90, %f89, %f86, %f78;
378
+ .loc 2 113 15
379
+ add.f32 %f91, %f82, %f84;
380
+ .loc 2 113 30
381
+ mul.f32 %f92, %f86, %f86;
382
+ .loc 2 113 38
383
+ mul.f32 %f93, %f75, %f92;
384
+ .loc 2 113 22
385
+ fma.rn.f32 %f94, %f89, %f93, %f91;
386
+ $L__tmp11:
387
+ .loc 2 120 46
388
+ mov.b32 %r117, %f90;
389
+ shfl.sync.bfly.b32 %r118, %r117, 1, 31, -1;
390
+ mov.b32 %f95, %r118;
391
+ mov.b32 %r119, %f94;
392
+ shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1;
393
+ mov.b32 %f96, %r120;
394
+ shfl.sync.bfly.b32 %r66, %r64, 1, 31, -1;
395
+ mov.b32 %f97, %r66;
396
+ $L__tmp12:
397
+ .loc 2 108 21
398
+ sub.f32 %f98, %f95, %f90;
399
+ .loc 2 109 28
400
+ add.f32 %f99, %f87, %f97;
401
+ .loc 2 110 39
402
+ setp.eq.f32 %p41, %f99, 0f00000000;
403
+ .loc 2 110 60
404
+ mov.b32 %r67, %f99;
405
+ div.full.f32 %r65, %r66, %r67;
406
+ mov.b32 %f100, %r65;
407
+ .loc 2 110 49
408
+ selp.f32 %f101, 0f00000000, %f100, %p41;
409
+ .loc 2 112 17
410
+ fma.rn.f32 %f102, %f98, %f101, %f90;
411
+ .loc 2 113 15
412
+ add.f32 %f103, %f94, %f96;
413
+ .loc 2 113 30
414
+ mul.f32 %f104, %f98, %f98;
415
+ .loc 2 113 38
416
+ mul.f32 %f105, %f87, %f104;
417
+ .loc 2 113 22
418
+ fma.rn.f32 %f106, %f101, %f105, %f103;
419
+ $L__tmp13:
420
+ .loc 2 120 46
421
+ setp.eq.s32 %p18, %r2, 0;
422
+ shl.b32 %r121, %r5, 2;
423
+ shl.b32 %r122, %r3, 3;
424
+ or.b32 %r123, %r122, %r121;
425
+ mov.u32 %r124, global_smem;
426
+ add.s32 %r68, %r124, %r123;
427
+ mov.b32 %r69, %f102;
428
+ @%p18 st.shared.b32 [ %r68 + 0 ], %r69;
429
+ add.s32 %r125, %r124, 16;
430
+ add.s32 %r70, %r125, %r123;
431
+ mov.b32 %r71, %f106;
432
+ @%p18 st.shared.b32 [ %r70 + 0 ], %r71;
433
+ add.s32 %r126, %r124, 32;
434
+ add.s32 %r72, %r126, %r123;
435
+ @%p18 st.shared.b32 [ %r72 + 0 ], %r67;
436
+ bar.sync 0;
437
+ setp.lt.s32 %p21, %r1, 4;
438
+ add.s32 %r75, %r124, %r24;
439
+ @%p21 ld.shared.b32 %r74, [ %r75 + 0 ];
440
+ mov.b32 %f107, %r74;
441
+ add.s32 %r77, %r125, %r24;
442
+ @%p21 ld.shared.b32 %r76, [ %r77 + 0 ];
443
+ mov.b32 %f108, %r76;
444
+ add.s32 %r79, %r126, %r24;
445
+ @%p21 ld.shared.b32 %r78, [ %r79 + 0 ];
446
+ mov.b32 %f109, %r78;
447
+ shfl.sync.bfly.b32 %r128, %r74, 1, 31, -1;
448
+ mov.b32 %f110, %r128;
449
+ shfl.sync.bfly.b32 %r129, %r76, 1, 31, -1;
450
+ mov.b32 %f111, %r129;
451
+ shfl.sync.bfly.b32 %r81, %r78, 1, 31, -1;
452
+ mov.b32 %f112, %r81;
453
+ $L__tmp14:
454
+ .loc 2 108 21
455
+ sub.f32 %f113, %f110, %f107;
456
+ .loc 2 109 28
457
+ add.f32 %f114, %f109, %f112;
458
+ .loc 2 110 39
459
+ setp.eq.f32 %p42, %f114, 0f00000000;
460
+ .loc 2 110 60
461
+ mov.b32 %r82, %f114;
462
+ div.full.f32 %r80, %r81, %r82;
463
+ mov.b32 %f115, %r80;
464
+ .loc 2 110 49
465
+ selp.f32 %f116, 0f00000000, %f115, %p42;
466
+ .loc 2 112 17
467
+ fma.rn.f32 %f117, %f113, %f116, %f107;
468
+ .loc 2 113 15
469
+ add.f32 %f118, %f108, %f111;
470
+ .loc 2 113 30
471
+ mul.f32 %f119, %f113, %f113;
472
+ .loc 2 113 38
473
+ mul.f32 %f120, %f109, %f119;
474
+ .loc 2 113 22
475
+ fma.rn.f32 %f121, %f120, %f116, %f118;
476
+ $L__tmp15:
477
+ .loc 2 120 46
478
+ setp.eq.s32 %p43, %r4, 0;
479
+ and.pred %p24, %p21, %p43;
480
+ mov.b32 %r84, %f117;
481
+ @%p24 st.shared.b32 [ %r75 + 0 ], %r84;
482
+ mov.b32 %r86, %f121;
483
+ @%p24 st.shared.b32 [ %r77 + 0 ], %r86;
484
+ @%p24 st.shared.b32 [ %r79 + 0 ], %r82;
485
+ bar.sync 0;
486
+ add.s32 %r130, %r124, %r122;
487
+ ld.shared.f32 %f5, [%r130];
488
+ add.s32 %r131, %r125, %r122;
489
+ ld.shared.f32 %f6, [%r131];
490
+ $L__tmp16:
491
+ .loc 1 59 51
492
+ mov.u32 %r89, 0x0;
493
+ mov.u32 %r90, 0x0;
494
+ mov.u32 %r91, 0x0;
495
+ mov.u32 %r92, 0x0;
496
+ @%p44 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd33 + 0 ];
497
+ @!%p44 mov.u32 %r89, %r137;
498
+ @!%p44 mov.u32 %r90, %r137;
499
+ @!%p44 mov.u32 %r91, %r137;
500
+ @!%p44 mov.u32 %r92, %r137;
501
+ .loc 1 60 35
502
+ mul.wide.u32 %rd41, %r7, 4;
503
+ add.s64 %rd34, %rd6, %rd41;
504
+ .loc 1 60 40
505
+ mov.u32 %r97, 0x0;
506
+ mov.u32 %r98, 0x0;
507
+ @%p44 ld.global.L1::evict_last.v2.b32 { %r97, %r98 }, [ %rd34 + 0 ];
508
+ @!%p44 mov.u32 %r97, %r137;
509
+ @!%p44 mov.u32 %r98, %r137;
510
+ .loc 1 64 57
511
+ @%p12 bra $L__BB0_4;
512
+ mov.u64 %rd42, assertMessage_1;
513
+ cvta.global.u64 %rd43, %rd42;
514
+ mov.u64 %rd44, assertFile_1;
515
+ cvta.global.u64 %rd45, %rd44;
516
+ mov.u64 %rd46, assertFunc_1;
517
+ cvta.global.u64 %rd47, %rd46;
518
+ { // callseq 5, 0
519
+ .reg .b32 temp_param_reg;
520
+ .param .b64 param0;
521
+ st.param.b64 [param0+0], %rd43;
522
+ .param .b64 param1;
523
+ st.param.b64 [param1+0], %rd45;
524
+ .param .b32 param2;
525
+ st.param.b32 [param2+0], %r168;
526
+ .param .b64 param3;
527
+ st.param.b64 [param3+0], %rd47;
528
+ .param .b64 param4;
529
+ st.param.b64 [param4+0], %rd52;
530
+ call.uni
531
+ __assertfail,
532
+ (
533
+ param0,
534
+ param1,
535
+ param2,
536
+ param3,
537
+ param4
538
+ );
539
+ } // callseq 5
540
+ $L__BB0_4:
541
+ .loc 1 65 54
542
+ mov.u32 %r133, 0x0;
543
+ mov.u32 %r134, 0x0;
544
+ mov.u32 %r135, 0x0;
545
+ mov.u32 %r136, 0x0;
546
+ @%p44 ld.global.L1::evict_first.v4.b32 { %r133, %r134, %r135, %r136 }, [ %rd49 + 0 ];
547
+ @!%p44 mov.u32 %r133, %r137;
548
+ @!%p44 mov.u32 %r134, %r137;
549
+ @!%p44 mov.u32 %r135, %r137;
550
+ @!%p44 mov.u32 %r136, %r137;
551
+ .loc 1 69 23
552
+ mov.b32 %r142, %f6;
553
+ mov.b32 %r143, 1132462080;
554
+ div.full.f32 %r141, %r142, %r143;
555
+ mov.b32 %f122, %r141;
556
+ .loc 1 71 24
557
+ add.f32 %f123, %f122, 0f3727C5AC;
558
+ .loc 1 72 30
559
+ rsqrt.approx.ftz.f32 %f124, %f123;
560
+ .loc 1 65 54
561
+ mov.b32 %f125, %r136;
562
+ .loc 1 59 51
563
+ mov.b32 %f126, %r92;
564
+ .loc 1 66 24
565
+ add.f32 %f127, %f126, %f125;
566
+ .loc 1 67 24
567
+ sub.f32 %f128, %f127, %f5;
568
+ .loc 1 65 54
569
+ mov.b32 %f129, %r135;
570
+ .loc 1 59 51
571
+ mov.b32 %f130, %r91;
572
+ .loc 1 66 24
573
+ add.f32 %f131, %f130, %f129;
574
+ .loc 1 67 24
575
+ sub.f32 %f132, %f131, %f5;
576
+ .loc 1 65 54
577
+ mov.b32 %f133, %r134;
578
+ .loc 1 59 51
579
+ mov.b32 %f134, %r90;
580
+ .loc 1 66 24
581
+ add.f32 %f135, %f134, %f133;
582
+ .loc 1 67 24
583
+ sub.f32 %f136, %f135, %f5;
584
+ .loc 1 65 54
585
+ mov.b32 %f137, %r133;
586
+ .loc 1 59 51
587
+ mov.b32 %f138, %r89;
588
+ .loc 1 66 24
589
+ add.f32 %f139, %f138, %f137;
590
+ .loc 1 67 24
591
+ sub.f32 %f140, %f139, %f5;
592
+ .loc 1 73 24
593
+ mul.f32 %f141, %f140, %f124;
594
+ mul.f32 %f142, %f136, %f124;
595
+ mul.f32 %f143, %f132, %f124;
596
+ mul.f32 %f144, %f128, %f124;
597
+ .loc 1 74 24
598
+ bar.sync 0;
599
+ shl.b32 %r159, %r7, 2;
600
+ add.s32 %r161, %r124, %r159;
601
+ st.shared.v2.u32 [%r161], {%r97, %r98};
602
+ bar.sync 0;
603
+ shl.b32 %r162, %r6, 2;
604
+ add.s32 %r163, %r124, %r162;
605
+ ld.shared.v4.f32 {%f145, %f146, %f147, %f148}, [%r163];
606
+ mul.f32 %f149, %f141, %f145;
607
+ mul.f32 %f150, %f142, %f146;
608
+ mul.f32 %f151, %f143, %f147;
609
+ mul.f32 %f152, %f144, %f148;
610
+ .loc 1 76 39
611
+ shl.b32 %r164, %r8, 8;
612
+ .loc 1 76 35
613
+ or.b32 %r165, %r164, %r6;
614
+ .loc 1 76 29
615
+ mul.wide.s32 %rd51, %r165, 2;
616
+ add.s64 %rd50, %rd7, %rd51;
617
+ .loc 1 76 52
618
+ mov.b32 %r153, %f149;
619
+ cvt.rn.bf16.f32 %rs1, %r153;
620
+ mov.b32 %r154, %f150;
621
+ cvt.rn.bf16.f32 %rs2, %r154;
622
+ mov.b32 %r155, %f151;
623
+ cvt.rn.bf16.f32 %rs3, %r155;
624
+ mov.b32 %r156, %f152;
625
+ cvt.rn.bf16.f32 %rs4, %r156;
626
+ mov.b32 %r166, {%rs1, %rs2};
627
+ mov.b32 %r167, {%rs3, %rs4};
628
+ @%p44 st.global.v2.b32 [ %rd50 + 0 ], { %r166, %r167 };
629
+ .loc 1 55 4
630
+ ret;
631
+ $L__tmp17:
632
+ $L__func_end0:
633
+
634
+ }
635
+ // .globl __nv_rsqrtf
636
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
637
+ .param .b32 __nv_rsqrtf_param_0
638
+ )
639
+ {
640
+ .reg .f32 %f<3>;
641
+ $L__func_begin1:
642
+
643
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
644
+ rsqrt.approx.ftz.f32 %f2, %f1;
645
+ st.param.f32 [func_retval0+0], %f2;
646
+ ret;
647
+ $L__func_end1:
648
+
649
+ }
650
+ .file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py"
651
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
652
+ .section .debug_abbrev
653
+ {
654
+ .b8 1
655
+ .b8 17
656
+ .b8 1
657
+ .b8 37
658
+ .b8 8
659
+ .b8 19
660
+ .b8 5
661
+ .b8 3
662
+ .b8 8
663
+ .b8 16
664
+ .b8 6
665
+ .b8 27
666
+ .b8 8
667
+ .b8 180
668
+ .b8 66
669
+ .b8 12
670
+ .b8 17
671
+ .b8 1
672
+ .b8 18
673
+ .b8 1
674
+ .b8 0
675
+ .b8 0
676
+ .b8 2
677
+ .b8 46
678
+ .b8 0
679
+ .b8 135
680
+ .b8 64
681
+ .b8 8
682
+ .b8 3
683
+ .b8 8
684
+ .b8 58
685
+ .b8 11
686
+ .b8 59
687
+ .b8 11
688
+ .b8 63
689
+ .b8 12
690
+ .b8 32
691
+ .b8 11
692
+ .b8 0
693
+ .b8 0
694
+ .b8 3
695
+ .b8 46
696
+ .b8 1
697
+ .b8 17
698
+ .b8 1
699
+ .b8 18
700
+ .b8 1
701
+ .b8 64
702
+ .b8 10
703
+ .b8 49
704
+ .b8 19
705
+ .b8 0
706
+ .b8 0
707
+ .b8 4
708
+ .b8 29
709
+ .b8 0
710
+ .b8 49
711
+ .b8 19
712
+ .b8 17
713
+ .b8 1
714
+ .b8 18
715
+ .b8 1
716
+ .b8 88
717
+ .b8 11
718
+ .b8 89
719
+ .b8 11
720
+ .b8 87
721
+ .b8 11
722
+ .b8 0
723
+ .b8 0
724
+ .b8 5
725
+ .b8 29
726
+ .b8 1
727
+ .b8 49
728
+ .b8 19
729
+ .b8 17
730
+ .b8 1
731
+ .b8 18
732
+ .b8 1
733
+ .b8 88
734
+ .b8 11
735
+ .b8 89
736
+ .b8 11
737
+ .b8 87
738
+ .b8 11
739
+ .b8 0
740
+ .b8 0
741
+ .b8 0
742
+ }
743
+ .section .debug_info
744
+ {
745
+ .b32 298
746
+ .b8 2
747
+ .b8 0
748
+ .b32 .debug_abbrev
749
+ .b8 8
750
+ .b8 1
751
+ .b8 116
752
+ .b8 114
753
+ .b8 105
754
+ .b8 116
755
+ .b8 111
756
+ .b8 110
757
+ .b8 0
758
+ .b8 2
759
+ .b8 0
760
+ .b8 99
761
+ .b8 108
762
+ .b8 104
763
+ .b8 101
764
+ .b8 52
765
+ .b8 97
766
+ .b8 51
767
+ .b8 115
768
+ .b8 116
769
+ .b8 118
770
+ .b8 117
771
+ .b8 102
772
+ .b8 120
773
+ .b8 97
774
+ .b8 102
775
+ .b8 109
776
+ .b8 113
777
+ .b8 51
778
+ .b8 107
779
+ .b8 107
780
+ .b8 53
781
+ .b8 104
782
+ .b8 111
783
+ .b8 100
784
+ .b8 97
785
+ .b8 122
786
+ .b8 122
787
+ .b8 50
788
+ .b8 101
789
+ .b8 102
790
+ .b8 99
791
+ .b8 116
792
+ .b8 102
793
+ .b8 102
794
+ .b8 116
795
+ .b8 101
796
+ .b8 54
797
+ .b8 52
798
+ .b8 54
799
+ .b8 122
800
+ .b8 110
801
+ .b8 106
802
+ .b8 100
803
+ .b8 110
804
+ .b8 118
805
+ .b8 51
806
+ .b8 108
807
+ .b8 113
808
+ .b8 105
809
+ .b8 53
810
+ .b8 111
811
+ .b8 97
812
+ .b8 46
813
+ .b8 112
814
+ .b8 121
815
+ .b8 0
816
+ .b32 .debug_line
817
+ .b8 47
818
+ .b8 116
819
+ .b8 109
820
+ .b8 112
821
+ .b8 47
822
+ .b8 116
823
+ .b8 111
824
+ .b8 114
825
+ .b8 99
826
+ .b8 104
827
+ .b8 105
828
+ .b8 110
829
+ .b8 100
830
+ .b8 117
831
+ .b8 99
832
+ .b8 116
833
+ .b8 111
834
+ .b8 114
835
+ .b8 95
836
+ .b8 114
837
+ .b8 111
838
+ .b8 111
839
+ .b8 116
840
+ .b8 47
841
+ .b8 108
842
+ .b8 104
843
+ .b8 0
844
+ .b8 1
845
+ .b64 $L__func_begin0
846
+ .b64 $L__func_end0
847
+ .b8 2
848
+ .b8 116
849
+ .b8 114
850
+ .b8 105
851
+ .b8 116
852
+ .b8 111
853
+ .b8 110
854
+ .b8 95
855
+ .b8 95
856
+ .b8 48
857
+ .b8 100
858
+ .b8 49
859
+ .b8 100
860
+ .b8 50
861
+ .b8 100
862
+ .b8 51
863
+ .b8 100
864
+ .b8 52
865
+ .b8 100
866
+ .b8 53
867
+ .b8 100
868
+ .b8 101
869
+ .b8 54
870
+ .b8 100
871
+ .b8 101
872
+ .b8 0
873
+ .b8 116
874
+ .b8 114
875
+ .b8 105
876
+ .b8 116
877
+ .b8 111
878
+ .b8 110
879
+ .b8 95
880
+ .b8 95
881
+ .b8 48
882
+ .b8 100
883
+ .b8 49
884
+ .b8 100
885
+ .b8 50
886
+ .b8 100
887
+ .b8 51
888
+ .b8 100
889
+ .b8 52
890
+ .b8 100
891
+ .b8 53
892
+ .b8 100
893
+ .b8 101
894
+ .b8 54
895
+ .b8 100
896
+ .b8 101
897
+ .b8 0
898
+ .b8 1
899
+ .b8 18
900
+ .b8 1
901
+ .b8 1
902
+ .b8 3
903
+ .b64 $L__func_begin0
904
+ .b64 $L__func_end0
905
+ .b8 1
906
+ .b8 156
907
+ .b32 125
908
+ .b8 4
909
+ .b32 125
910
+ .b64 $L__tmp1
911
+ .b64 $L__tmp2
912
+ .b8 2
913
+ .b8 44
914
+ .b8 38
915
+ .b8 5
916
+ .b32 125
917
+ .b64 $L__tmp2
918
+ .b64 $L__tmp15
919
+ .b8 2
920
+ .b8 50
921
+ .b8 41
922
+ .b8 4
923
+ .b32 125
924
+ .b64 $L__tmp2
925
+ .b64 $L__tmp15
926
+ .b8 2
927
+ .b8 120
928
+ .b8 46
929
+ .b8 0
930
+ .b8 4
931
+ .b32 125
932
+ .b64 $L__tmp3
933
+ .b64 $L__tmp16
934
+ .b8 2
935
+ .b8 50
936
+ .b8 41
937
+ .b8 0
938
+ .b8 0
939
+ }
940
+ .section .debug_pubnames
941
+ {
942
+ .b32 $L__pubNames_end0-$L__pubNames_start0
943
+ $L__pubNames_start0:
944
+ .b8 2
945
+ .b8 0
946
+ .b32 .debug_info
947
+ .b32 302
948
+ .b32 125
949
+ .b8 116
950
+ .b8 114
951
+ .b8 105
952
+ .b8 116
953
+ .b8 111
954
+ .b8 110
955
+ .b8 95
956
+ .b8 95
957
+ .b8 48
958
+ .b8 100
959
+ .b8 49
960
+ .b8 100
961
+ .b8 50
962
+ .b8 100
963
+ .b8 51
964
+ .b8 100
965
+ .b8 52
966
+ .b8 100
967
+ .b8 53
968
+ .b8 100
969
+ .b8 101
970
+ .b8 54
971
+ .b8 100
972
+ .b8 101
973
+ .b8 0
974
+ .b32 0
975
+ $L__pubNames_end0:
976
+ }
977
+ .section .debug_pubtypes
978
+ {
979
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
980
+ $L__pubTypes_start0:
981
+ .b8 2
982
+ .b8 0
983
+ .b32 .debug_info
984
+ .b32 302
985
+ .b32 0
986
+ $L__pubTypes_end0:
987
+ }
988
+ .section .debug_loc { }
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir ADDED
@@ -0,0 +1,104 @@
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
5
+ %cst_1 = arith.constant 0.000000e+00 : f32
6
+ %cst_2 = arith.constant dense<256> : tensor<2x1xi64>
7
+ %cst_3 = arith.constant dense<50257> : tensor<2x1xi64>
8
+ %cst_4 = arith.constant dense<0> : tensor<2x1xi64>
9
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>
10
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>
11
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
12
+ %cst_8 = arith.constant dense<256> : tensor<2x1xi32>
13
+ %cst_9 = arith.constant dense<256> : tensor<1x256xi32>
14
+ %cst_10 = arith.constant dense<512> : tensor<2x1xi32>
15
+ %c2_i32 = arith.constant 2 : i32
16
+ %0 = tt.get_program_id x : i32
17
+ %1 = arith.muli %0, %c2_i32 : i32
18
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
19
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
20
+ %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
21
+ %5 = arith.addi %4, %3 : tensor<2x1xi32>
22
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
23
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
24
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
25
+ %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
26
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>
27
+ %11 = arith.remsi %5, %cst_10 : tensor<2x1xi32>
28
+ %12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32>
29
+ %13 = arith.muli %11, %cst_8 : tensor<2x1xi32>
30
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
31
+ %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
32
+ %16 = arith.addi %14, %15 : tensor<2x256xi32>
33
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
34
+ %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
35
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
36
+ %20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
37
+ %21 = arith.addi %10, %cst_3 : tensor<2x1xi64>
38
+ %22 = arith.cmpi slt, %10, %cst_4 : tensor<2x1xi64>
39
+ %23 = arith.select %22, %21, %10 : tensor<2x1xi1>, tensor<2x1xi64>
40
+ %24 = arith.cmpi sge, %23, %cst_4 : tensor<2x1xi64>
41
+ %25 = arith.cmpi slt, %23, %cst_3 : tensor<2x1xi64>
42
+ %26 = arith.andi %24, %25 : tensor<2x1xi1>
43
+ tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
44
+ %27 = arith.muli %23, %cst_2 : tensor<2x1xi64>
45
+ %28 = tt.broadcast %27 : (tensor<2x1xi64>) -> tensor<2x256xi64>
46
+ %29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
47
+ %30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<2x256xi64>
48
+ %31 = arith.addi %30, %28 : tensor<2x256xi64>
49
+ %32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
50
+ %33 = tt.addptr %32, %31 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
51
+ %34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
52
+ %35 = arith.addf %34, %20 : tensor<2x256xf32>
53
+ %36 = arith.addf %35, %cst_7 : tensor<2x256xf32>
54
+ %37 = arith.subf %35, %36 : tensor<2x256xf32>
55
+ %38 = arith.mulf %35, %37 : tensor<2x256xf32>
56
+ %39 = arith.addf %38, %cst_7 : tensor<2x256xf32>
57
+ %40 = arith.select %19, %36, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32>
58
+ %41 = arith.select %19, %39, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32>
59
+ %42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32>
60
+ %43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<2x256xf32>
61
+ %44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({
62
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
63
+ %68 = arith.subf %arg10, %arg7 : f32
64
+ %69 = arith.addf %arg9, %arg12 : f32
65
+ %70 = arith.cmpf oeq, %69, %cst_1 : f32
66
+ %71 = arith.divf %arg12, %69 : f32
67
+ %72 = arith.select %70, %cst_1, %71 : f32
68
+ %73 = arith.mulf %68, %72 : f32
69
+ %74 = arith.addf %arg7, %73 : f32
70
+ %75 = arith.addf %arg8, %arg11 : f32
71
+ %76 = arith.mulf %68, %68 : f32
72
+ %77 = arith.mulf %76, %arg9 : f32
73
+ %78 = arith.mulf %77, %72 : f32
74
+ %79 = arith.addf %75, %78 : f32
75
+ tt.reduce.return %74, %79, %69 : f32, f32, f32
76
+ }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
77
+ %45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
78
+ %46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
79
+ %47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
80
+ %48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
81
+ %49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
82
+ %50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
83
+ tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
84
+ %51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>
85
+ %52 = arith.addf %51, %47 : tensor<2x256xf32>
86
+ %53 = tt.broadcast %45 : (tensor<2x1xf32>) -> tensor<2x256xf32>
87
+ %54 = arith.subf %52, %53 : tensor<2x256xf32>
88
+ %55 = arith.divf %46, %cst_6 : tensor<2x1xf32>
89
+ %56 = arith.addf %55, %cst_5 : tensor<2x1xf32>
90
+ %57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
91
+ %58 = tt.broadcast %57 : (tensor<2x1xf32>) -> tensor<2x256xf32>
92
+ %59 = arith.mulf %54, %58 : tensor<2x256xf32>
93
+ %60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
94
+ %61 = arith.mulf %59, %60 : tensor<2x256xf32>
95
+ %62 = arith.muli %5, %cst_8 : tensor<2x1xi32>
96
+ %63 = tt.broadcast %62 : (tensor<2x1xi32>) -> tensor<2x256xi32>
97
+ %64 = arith.addi %14, %63 : tensor<2x256xi32>
98
+ %65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
99
+ %66 = tt.addptr %65, %64 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
100
+ %67 = arith.truncf %61 : tensor<2x256xf32> to tensor<2x256xbf16>
101
+ tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
102
+ tt.return
103
+ }
104
+ }
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx ADDED
@@ -0,0 +1,782 @@
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8de9de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
21
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
22
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
23
+ )
24
+ .maxntid 64, 1, 1
25
+ {
26
+ .reg .pred %p<45>;
27
+ .reg .b16 %rs<5>;
28
+ .reg .b32 %r<106>;
29
+ .reg .f32 %f<90>;
30
+ .reg .b64 %rd<44>;
31
+ .loc 1 18 0
32
+ $L__func_begin0:
33
+ .loc 1 18 0
34
+
35
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
36
+ ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
37
+ $L__tmp0:
38
+ .loc 1 26 26
39
+ mov.u32 %r74, %tid.x;
40
+ and.b32 %r75, %r74, 31;
41
+ ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
42
+ ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
43
+ ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
44
+ shl.b32 %r76, %r74, 2;
45
+ ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
46
+ and.b32 %r77, %r76, 252;
47
+ ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
48
+ ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
49
+ .loc 1 23 28
50
+ mov.u32 %r1, %ctaid.x;
51
+ .loc 1 30 40
52
+ shl.b32 %r78, %r1, 8;
53
+ .loc 1 30 36
54
+ or.b32 %r79, %r78, %r77;
55
+ .loc 1 30 30
56
+ mul.wide.s32 %rd33, %r79, 2;
57
+ add.s64 %rd1, %rd26, %rd33;
58
+ mov.b32 %r4, 0;
59
+ mov.pred %p1, -1;
60
+ .loc 1 30 46
61
+ mov.u32 %r2, 0x0;
62
+ mov.u32 %r3, 0x0;
63
+ @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
64
+ @!%p1 mov.u32 %r2, %r4;
65
+ @!%p1 mov.u32 %r3, %r4;
66
+ cvt.u16.u32 %rs1, %r2;
67
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
68
+ cvt.u16.u32 %rs3, %r3;
69
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
70
+ .loc 1 30 67
71
+ cvt.f32.bf16 %r6, %rs1;
72
+ mov.b32 %f1, %r6;
73
+ cvt.f32.bf16 %r7, %rs2;
74
+ mov.b32 %f2, %r7;
75
+ cvt.f32.bf16 %r8, %rs3;
76
+ mov.b32 %f3, %r8;
77
+ cvt.f32.bf16 %r9, %rs4;
78
+ mov.b32 %f4, %r9;
79
+ .loc 1 31 30
80
+ cvt.u64.u32 %rd34, %r77;
81
+ mul.wide.u32 %rd35, %r77, 4;
82
+ add.s64 %rd2, %rd27, %rd35;
83
+ .loc 1 31 35
84
+ mov.u32 %r10, 0x0;
85
+ mov.u32 %r11, 0x0;
86
+ mov.u32 %r12, 0x0;
87
+ mov.u32 %r13, 0x0;
88
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
89
+ @!%p1 mov.u32 %r10, %r4;
90
+ @!%p1 mov.u32 %r11, %r4;
91
+ @!%p1 mov.u32 %r12, %r4;
92
+ @!%p1 mov.u32 %r13, %r4;
93
+ mov.b32 %f5, %r10;
94
+ mov.b32 %f6, %r11;
95
+ mov.b32 %f7, %r12;
96
+ mov.b32 %f8, %r13;
97
+ .loc 1 32 30
98
+ mul.wide.s32 %rd36, %r79, 4;
99
+ add.s64 %rd3, %rd28, %rd36;
100
+ .loc 1 32 46
101
+ mov.u32 %r18, 0x0;
102
+ mov.u32 %r19, 0x0;
103
+ mov.u32 %r20, 0x0;
104
+ mov.u32 %r21, 0x0;
105
+ @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
106
+ @!%p1 mov.u32 %r18, %r4;
107
+ @!%p1 mov.u32 %r19, %r4;
108
+ @!%p1 mov.u32 %r20, %r4;
109
+ @!%p1 mov.u32 %r21, %r4;
110
+ mov.b32 %f9, %r18;
111
+ mov.b32 %f10, %r19;
112
+ mov.b32 %f11, %r20;
113
+ mov.b32 %f12, %r21;
114
+ .loc 1 33 30
115
+ mul.wide.s32 %rd37, %r1, 4;
116
+ add.s64 %rd4, %rd29, %rd37;
117
+ .loc 1 33 35
118
+ mov.u32 %r26, 0x0;
119
+ @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
120
+ mov.b32 %f13, %r26;
121
+ mov.u32 %r27, 0x0;
122
+ @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
123
+ mov.u32 %r28, 0x0;
124
+ @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
125
+ mov.u32 %r29, 0x0;
126
+ @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
127
+ .loc 1 34 31
128
+ add.s64 %rd8, %rd30, %rd37;
129
+ .loc 1 34 36
130
+ mov.u32 %r55, 0x0;
131
+ @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
132
+ mov.b32 %f14, %r55;
133
+ mov.u32 %r31, 0x0;
134
+ @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
135
+ mov.u32 %r32, 0x0;
136
+ @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
137
+ mov.u32 %r33, 0x0;
138
+ @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
139
+ .loc 1 35 31
140
+ mul.wide.s32 %rd38, %r1, 8;
141
+ add.s64 %rd13, %rd31, %rd38;
142
+ .loc 1 35 36
143
+ mov.u64 %rd12, 0x0;
144
+ @%p1 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd13 + 0 ];
145
+ mov.u64 %rd14, 0x0;
146
+ @%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd13 + 0 ];
147
+ mov.u64 %rd16, 0x0;
148
+ @%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd13 + 0 ];
149
+ mov.u64 %rd18, 0x0;
150
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd13 + 0 ];
151
+ .loc 1 36 35
152
+ add.s64 %rd20, %rd25, %rd36;
153
+ .loc 1 36 51
154
+ mov.u32 %r34, 0x0;
155
+ mov.u32 %r35, 0x0;
156
+ mov.u32 %r36, 0x0;
157
+ mov.u32 %r37, 0x0;
158
+ @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd20 + 0 ];
159
+ @!%p1 mov.u32 %r34, %r4;
160
+ @!%p1 mov.u32 %r35, %r4;
161
+ @!%p1 mov.u32 %r36, %r4;
162
+ @!%p1 mov.u32 %r37, %r4;
163
+ mov.b32 %f15, %r34;
164
+ mov.b32 %f16, %r35;
165
+ mov.b32 %f17, %r36;
166
+ mov.b32 %f18, %r37;
167
+ .loc 1 38 18
168
+ mul.f32 %f19, %f1, %f5;
169
+ mul.f32 %f20, %f2, %f6;
170
+ mul.f32 %f21, %f3, %f7;
171
+ mul.f32 %f22, %f4, %f8;
172
+ $L__tmp1:
173
+ .loc 2 233 15
174
+ fma.rn.f32 %f23, %f1, %f5, %f20;
175
+ fma.rn.f32 %f24, %f3, %f7, %f23;
176
+ fma.rn.f32 %f25, %f4, %f8, %f24;
177
+ $L__tmp2:
178
+ .loc 2 243 36
179
+ mov.b32 %r80, %f25;
180
+ shfl.sync.bfly.b32 %r81, %r80, 16, 31, -1;
181
+ mov.b32 %f26, %r81;
182
+ $L__tmp3:
183
+ .loc 2 233 15
184
+ add.f32 %f27, %f25, %f26;
185
+ $L__tmp4:
186
+ .loc 2 243 36
187
+ mov.b32 %r82, %f27;
188
+ shfl.sync.bfly.b32 %r83, %r82, 8, 31, -1;
189
+ mov.b32 %f28, %r83;
190
+ $L__tmp5:
191
+ .loc 2 233 15
192
+ add.f32 %f29, %f27, %f28;
193
+ $L__tmp6:
194
+ .loc 2 243 36
195
+ mov.b32 %r84, %f29;
196
+ shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1;
197
+ mov.b32 %f30, %r85;
198
+ $L__tmp7:
199
+ .loc 2 233 15
200
+ add.f32 %f31, %f29, %f30;
201
+ $L__tmp8:
202
+ .loc 2 243 36
203
+ mov.b32 %r86, %f31;
204
+ shfl.sync.bfly.b32 %r87, %r86, 2, 31, -1;
205
+ mov.b32 %f32, %r87;
206
+ $L__tmp9:
207
+ .loc 2 233 15
208
+ add.f32 %f33, %f31, %f32;
209
+ $L__tmp10:
210
+ .loc 2 243 36
211
+ mov.b32 %r88, %f33;
212
+ shfl.sync.bfly.b32 %r89, %r88, 1, 31, -1;
213
+ mov.b32 %f34, %r89;
214
+ $L__tmp11:
215
+ .loc 2 233 15
216
+ add.f32 %f35, %f33, %f34;
217
+ $L__tmp12:
218
+ .loc 2 243 36
219
+ setp.eq.s32 %p31, %r75, 0;
220
+ shr.u32 %r90, %r74, 3;
221
+ and.b32 %r91, %r90, 4;
222
+ mov.u32 %r92, global_smem;
223
+ add.s32 %r42, %r92, %r91;
224
+ mov.b32 %r43, %f35;
225
+ @%p31 st.shared.b32 [ %r42 + 0 ], %r43;
226
+ bar.sync 0;
227
+ setp.lt.s32 %p32, %r74, 2;
228
+ add.s32 %r45, %r92, %r76;
229
+ @%p32 ld.shared.b32 %r44, [ %r45 + 0 ];
230
+ mov.b32 %f36, %r44;
231
+ shfl.sync.bfly.b32 %r93, %r44, 1, 31, -1;
232
+ mov.b32 %f37, %r93;
233
+ $L__tmp13:
234
+ .loc 2 233 15
235
+ add.f32 %f38, %f36, %f37;
236
+ $L__tmp14:
237
+ .loc 2 243 36
238
+ and.b32 %r94, %r74, 1;
239
+ setp.eq.b32 %p41, %r94, 1;
240
+ not.pred %p42, %p41;
241
+ and.pred %p33, %p32, %p42;
242
+ mov.b32 %r47, %f38;
243
+ @%p33 st.shared.b32 [ %r45 + 0 ], %r47;
244
+ bar.sync 0;
245
+ ld.shared.f32 %f39, [global_smem];
246
+ $L__tmp15:
247
+ .loc 3 8 15
248
+ add.f32 %f40, %f39, 0f00000000;
249
+ $L__tmp16:
250
+ .loc 1 42 19
251
+ sub.f32 %f41, %f9, %f13;
252
+ sub.f32 %f42, %f10, %f13;
253
+ sub.f32 %f43, %f11, %f13;
254
+ sub.f32 %f44, %f12, %f13;
255
+ .loc 1 43 20
256
+ mul.f32 %f45, %f41, %f14;
257
+ mul.f32 %f46, %f42, %f14;
258
+ mul.f32 %f47, %f43, %f14;
259
+ mul.f32 %f48, %f44, %f14;
260
+ .loc 1 44 19
261
+ mul.f32 %f49, %f20, %f46;
262
+ $L__tmp17:
263
+ .loc 2 243 36
264
+ bar.sync 0;
265
+ $L__tmp18:
266
+ .loc 2 233 15
267
+ fma.rn.f32 %f50, %f19, %f45, %f49;
268
+ fma.rn.f32 %f51, %f21, %f47, %f50;
269
+ fma.rn.f32 %f52, %f22, %f48, %f51;
270
+ $L__tmp19:
271
+ .loc 2 243 36
272
+ mov.b32 %r95, %f52;
273
+ shfl.sync.bfly.b32 %r96, %r95, 16, 31, -1;
274
+ mov.b32 %f53, %r96;
275
+ $L__tmp20:
276
+ .loc 2 233 15
277
+ add.f32 %f54, %f52, %f53;
278
+ $L__tmp21:
279
+ .loc 2 243 36
280
+ mov.b32 %r97, %f54;
281
+ shfl.sync.bfly.b32 %r98, %r97, 8, 31, -1;
282
+ mov.b32 %f55, %r98;
283
+ $L__tmp22:
284
+ .loc 2 233 15
285
+ add.f32 %f56, %f54, %f55;
286
+ $L__tmp23:
287
+ .loc 2 243 36
288
+ mov.b32 %r99, %f56;
289
+ shfl.sync.bfly.b32 %r100, %r99, 4, 31, -1;
290
+ mov.b32 %f57, %r100;
291
+ $L__tmp24:
292
+ .loc 2 233 15
293
+ add.f32 %f58, %f56, %f57;
294
+ $L__tmp25:
295
+ .loc 2 243 36
296
+ mov.b32 %r101, %f58;
297
+ shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1;
298
+ mov.b32 %f59, %r102;
299
+ $L__tmp26:
300
+ .loc 2 233 15
301
+ add.f32 %f60, %f58, %f59;
302
+ $L__tmp27:
303
+ .loc 2 243 36
304
+ mov.b32 %r103, %f60;
305
+ shfl.sync.bfly.b32 %r104, %r103, 1, 31, -1;
306
+ mov.b32 %f61, %r104;
307
+ $L__tmp28:
308
+ .loc 2 233 15
309
+ add.f32 %f62, %f60, %f61;
310
+ $L__tmp29:
311
+ .loc 2 243 36
312
+ mov.b32 %r49, %f62;
313
+ @%p31 st.shared.b32 [ %r42 + 0 ], %r49;
314
+ bar.sync 0;
315
+ @%p32 ld.shared.b32 %r50, [ %r45 + 0 ];
316
+ mov.b32 %f63, %r50;
317
+ shfl.sync.bfly.b32 %r105, %r50, 1, 31, -1;
318
+ mov.b32 %f64, %r105;
319
+ $L__tmp30:
320
+ .loc 2 233 15
321
+ add.f32 %f65, %f63, %f64;
322
+ $L__tmp31:
323
+ .loc 2 243 36
324
+ mov.b32 %r53, %f65;
325
+ @%p33 st.shared.b32 [ %r45 + 0 ], %r53;
326
+ bar.sync 0;
327
+ ld.shared.f32 %f66, [global_smem];
328
+ $L__tmp32:
329
+ .loc 3 8 15
330
+ add.f32 %f67, %f66, 0f00000000;
331
+ $L__tmp33:
332
+ .loc 1 49 21
333
+ setp.eq.s64 %p43, %rd12, -1;
334
+ mov.b32 %r56, 1132462080;
335
+ .loc 1 51 20
336
+ div.full.f32 %r54, %r55, %r56;
337
+ mov.b32 %f68, %r54;
338
+ .loc 1 53 20
339
+ neg.f32 %f69, %f40;
340
+ fma.rn.f32 %f70, %f19, 0f43800000, %f69;
341
+ fma.rn.f32 %f71, %f20, 0f43800000, %f69;
342
+ fma.rn.f32 %f72, %f21, 0f43800000, %f69;
343
+ fma.rn.f32 %f73, %f22, 0f43800000, %f69;
344
+ .loc 1 55 20
345
+ neg.f32 %f74, %f45;
346
+ fma.rn.f32 %f75, %f74, %f67, %f70;
347
+ neg.f32 %f76, %f46;
348
+ fma.rn.f32 %f77, %f76, %f67, %f71;
349
+ neg.f32 %f78, %f47;
350
+ fma.rn.f32 %f79, %f78, %f67, %f72;
351
+ neg.f32 %f80, %f48;
352
+ fma.rn.f32 %f81, %f80, %f67, %f73;
353
+ .loc 1 57 20
354
+ fma.rn.f32 %f82, %f68, %f75, %f15;
355
+ fma.rn.f32 %f83, %f68, %f77, %f16;
356
+ fma.rn.f32 %f84, %f68, %f79, %f17;
357
+ fma.rn.f32 %f85, %f68, %f81, %f18;
358
+ .loc 1 59 35
359
+ selp.f32 %f86, 0f00000000, %f82, %p43;
360
+ selp.f32 %f87, 0f00000000, %f83, %p43;
361
+ selp.f32 %f88, 0f00000000, %f84, %p43;
362
+ selp.f32 %f89, 0f00000000, %f85, %p43;
363
+ .loc 1 61 20
364
+ setp.lt.s64 %p44, %rd12, 0;
365
+ .loc 1 63 56
366
+ shl.b64 %rd39, %rd12, 8;
367
+ add.s64 %rd40, %rd39, 12865792;
368
+ selp.b64 %rd41, %rd40, %rd39, %p44;
369
+ .loc 1 63 52
370
+ or.b64 %rd42, %rd41, %rd34;
371
+ .loc 1 63 30
372
+ shl.b64 %rd43, %rd42, 2;
373
+ add.s64 %rd21, %rd32, %rd43;
374
+ add.s64 %rd22, %rd21, 4;
375
+ add.s64 %rd23, %rd21, 8;
376
+ add.s64 %rd24, %rd21, 12;
377
+ .loc 1 63 83
378
+ mov.b32 %r67, %f86;
379
+ mov.u32 %r66, 0x0;
380
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r66, [ %rd21 + 0 ], %r67;
381
+ mov.b32 %r69, %f87;
382
+ mov.u32 %r68, 0x0;
383
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r68, [ %rd22 + 0 ], %r69;
384
+ mov.b32 %r71, %f88;
385
+ mov.u32 %r70, 0x0;
386
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r70, [ %rd23 + 0 ], %r71;
387
+ mov.b32 %r73, %f89;
388
+ mov.u32 %r72, 0x0;
389
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r72, [ %rd24 + 0 ], %r73;
390
+ .loc 1 63 4
391
+ ret;
392
+ $L__tmp34:
393
+ $L__func_end0:
394
+
395
+ }
396
+ .file 1 "/tmp/torchinductor_root/qr/cqryxm46jcxyr3qdktqirn53eap7h3pjjqiqavyqqyvflabjpvmd.py"
397
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
398
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
399
+ .section .debug_abbrev
400
+ {
401
+ .b8 1
402
+ .b8 17
403
+ .b8 1
404
+ .b8 37
405
+ .b8 8
406
+ .b8 19
407
+ .b8 5
408
+ .b8 3
409
+ .b8 8
410
+ .b8 16
411
+ .b8 6
412
+ .b8 27
413
+ .b8 8
414
+ .b8 180
415
+ .b8 66
416
+ .b8 12
417
+ .b8 17
418
+ .b8 1
419
+ .b8 18
420
+ .b8 1
421
+ .b8 0
422
+ .b8 0
423
+ .b8 2
424
+ .b8 46
425
+ .b8 0
426
+ .b8 135
427
+ .b8 64
428
+ .b8 8
429
+ .b8 3
430
+ .b8 8
431
+ .b8 58
432
+ .b8 11
433
+ .b8 59
434
+ .b8 11
435
+ .b8 63
436
+ .b8 12
437
+ .b8 32
438
+ .b8 11
439
+ .b8 0
440
+ .b8 0
441
+ .b8 3
442
+ .b8 46
443
+ .b8 1
444
+ .b8 17
445
+ .b8 1
446
+ .b8 18
447
+ .b8 1
448
+ .b8 64
449
+ .b8 10
450
+ .b8 49
451
+ .b8 19
452
+ .b8 0
453
+ .b8 0
454
+ .b8 4
455
+ .b8 29
456
+ .b8 1
457
+ .b8 49
458
+ .b8 19
459
+ .b8 17
460
+ .b8 1
461
+ .b8 18
462
+ .b8 1
463
+ .b8 88
464
+ .b8 11
465
+ .b8 89
466
+ .b8 11
467
+ .b8 87
468
+ .b8 11
469
+ .b8 0
470
+ .b8 0
471
+ .b8 5
472
+ .b8 29
473
+ .b8 0
474
+ .b8 49
475
+ .b8 19
476
+ .b8 17
477
+ .b8 1
478
+ .b8 18
479
+ .b8 1
480
+ .b8 88
481
+ .b8 11
482
+ .b8 89
483
+ .b8 11
484
+ .b8 87
485
+ .b8 11
486
+ .b8 0
487
+ .b8 0
488
+ .b8 0
489
+ }
490
+ .section .debug_info
491
+ {
492
+ .b32 407
493
+ .b8 2
494
+ .b8 0
495
+ .b32 .debug_abbrev
496
+ .b8 8
497
+ .b8 1
498
+ .b8 116
499
+ .b8 114
500
+ .b8 105
501
+ .b8 116
502
+ .b8 111
503
+ .b8 110
504
+ .b8 0
505
+ .b8 2
506
+ .b8 0
507
+ .b8 99
508
+ .b8 113
509
+ .b8 114
510
+ .b8 121
511
+ .b8 120
512
+ .b8 109
513
+ .b8 52
514
+ .b8 54
515
+ .b8 106
516
+ .b8 99
517
+ .b8 120
518
+ .b8 121
519
+ .b8 114
520
+ .b8 51
521
+ .b8 113
522
+ .b8 100
523
+ .b8 107
524
+ .b8 116
525
+ .b8 113
526
+ .b8 105
527
+ .b8 114
528
+ .b8 110
529
+ .b8 53
530
+ .b8 51
531
+ .b8 101
532
+ .b8 97
533
+ .b8 112
534
+ .b8 55
535
+ .b8 104
536
+ .b8 51
537
+ .b8 112
538
+ .b8 106
539
+ .b8 106
540
+ .b8 113
541
+ .b8 105
542
+ .b8 113
543
+ .b8 97
544
+ .b8 118
545
+ .b8 121
546
+ .b8 113
547
+ .b8 113
548
+ .b8 121
549
+ .b8 118
550
+ .b8 102
551
+ .b8 108
552
+ .b8 97
553
+ .b8 98
554
+ .b8 106
555
+ .b8 112
556
+ .b8 118
557
+ .b8 109
558
+ .b8 100
559
+ .b8 46
560
+ .b8 112
561
+ .b8 121
562
+ .b8 0
563
+ .b32 .debug_line
564
+ .b8 47
565
+ .b8 116
566
+ .b8 109
567
+ .b8 112
568
+ .b8 47
569
+ .b8 116
570
+ .b8 111
571
+ .b8 114
572
+ .b8 99
573
+ .b8 104
574
+ .b8 105
575
+ .b8 110
576
+ .b8 100
577
+ .b8 117
578
+ .b8 99
579
+ .b8 116
580
+ .b8 111
581
+ .b8 114
582
+ .b8 95
583
+ .b8 114
584
+ .b8 111
585
+ .b8 111
586
+ .b8 116
587
+ .b8 47
588
+ .b8 113
589
+ .b8 114
590
+ .b8 0
591
+ .b8 1
592
+ .b64 $L__func_begin0
593
+ .b64 $L__func_end0
594
+ .b8 2
595
+ .b8 116
596
+ .b8 114
597
+ .b8 105
598
+ .b8 116
599
+ .b8 111
600
+ .b8 110
601
+ .b8 95
602
+ .b8 95
603
+ .b8 48
604
+ .b8 100
605
+ .b8 49
606
+ .b8 100
607
+ .b8 50
608
+ .b8 100
609
+ .b8 51
610
+ .b8 100
611
+ .b8 52
612
+ .b8 100
613
+ .b8 53
614
+ .b8 100
615
+ .b8 54
616
+ .b8 100
617
+ .b8 55
618
+ .b8 100
619
+ .b8 56
620
+ .b8 100
621
+ .b8 101
622
+ .b8 57
623
+ .b8 100
624
+ .b8 101
625
+ .b8 0
626
+ .b8 116
627
+ .b8 114
628
+ .b8 105
629
+ .b8 116
630
+ .b8 111
631
+ .b8 110
632
+ .b8 95
633
+ .b8 95
634
+ .b8 48
635
+ .b8 100
636
+ .b8 49
637
+ .b8 100
638
+ .b8 50
639
+ .b8 100
640
+ .b8 51
641
+ .b8 100
642
+ .b8 52
643
+ .b8 100
644
+ .b8 53
645
+ .b8 100
646
+ .b8 54
647
+ .b8 100
648
+ .b8 55
649
+ .b8 100
650
+ .b8 56
651
+ .b8 100
652
+ .b8 101
653
+ .b8 57
654
+ .b8 100
655
+ .b8 101
656
+ .b8 0
657
+ .b8 1
658
+ .b8 18
659
+ .b8 1
660
+ .b8 1
661
+ .b8 3
662
+ .b64 $L__func_begin0
663
+ .b64 $L__func_end0
664
+ .b8 1
665
+ .b8 156
666
+ .b32 125
667
+ .b8 4
668
+ .b32 125
669
+ .b64 $L__tmp1
670
+ .b64 $L__tmp14
671
+ .b8 2
672
+ .b8 41
673
+ .b8 57
674
+ .b8 5
675
+ .b32 125
676
+ .b64 $L__tmp1
677
+ .b64 $L__tmp14
678
+ .b8 2
679
+ .b8 243
680
+ .b8 36
681
+ .b8 0
682
+ .b8 5
683
+ .b32 125
684
+ .b64 $L__tmp2
685
+ .b64 $L__tmp15
686
+ .b8 2
687
+ .b8 41
688
+ .b8 57
689
+ .b8 5
690
+ .b32 125
691
+ .b64 $L__tmp15
692
+ .b64 $L__tmp16
693
+ .b8 3
694
+ .b8 41
695
+ .b8 44
696
+ .b8 5
697
+ .b32 125
698
+ .b64 $L__tmp17
699
+ .b64 $L__tmp32
700
+ .b8 2
701
+ .b8 47
702
+ .b8 59
703
+ .b8 4
704
+ .b32 125
705
+ .b64 $L__tmp18
706
+ .b64 $L__tmp31
707
+ .b8 2
708
+ .b8 47
709
+ .b8 59
710
+ .b8 5
711
+ .b32 125
712
+ .b64 $L__tmp18
713
+ .b64 $L__tmp31
714
+ .b8 2
715
+ .b8 243
716
+ .b8 36
717
+ .b8 0
718
+ .b8 5
719
+ .b32 125
720
+ .b64 $L__tmp32
721
+ .b64 $L__tmp33
722
+ .b8 3
723
+ .b8 47
724
+ .b8 45
725
+ .b8 0
726
+ .b8 0
727
+ }
728
+ .section .debug_pubnames
729
+ {
730
+ .b32 $L__pubNames_end0-$L__pubNames_start0
731
+ $L__pubNames_start0:
732
+ .b8 2
733
+ .b8 0
734
+ .b32 .debug_info
735
+ .b32 411
736
+ .b32 125
737
+ .b8 116
738
+ .b8 114
739
+ .b8 105
740
+ .b8 116
741
+ .b8 111
742
+ .b8 110
743
+ .b8 95
744
+ .b8 95
745
+ .b8 48
746
+ .b8 100
747
+ .b8 49
748
+ .b8 100
749
+ .b8 50
750
+ .b8 100
751
+ .b8 51
752
+ .b8 100
753
+ .b8 52
754
+ .b8 100
755
+ .b8 53
756
+ .b8 100
757
+ .b8 54
758
+ .b8 100
759
+ .b8 55
760
+ .b8 100
761
+ .b8 56
762
+ .b8 100
763
+ .b8 101
764
+ .b8 57
765
+ .b8 100
766
+ .b8 101
767
+ .b8 0
768
+ .b32 0
769
+ $L__pubNames_end0:
770
+ }
771
+ .section .debug_pubtypes
772
+ {
773
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
774
+ $L__pubTypes_start0:
775
+ .b8 2
776
+ .b8 0
777
+ .b32 .debug_info
778
+ .b32 411
779
+ .b32 0
780
+ $L__pubTypes_end0:
781
+ }
782
+ .section .debug_loc { }
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir ADDED
@@ -0,0 +1,88 @@
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<-1> : tensor<1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
7
+ %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
8
+ %cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked>
9
+ %cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked>
10
+ %cst_5 = arith.constant 0.000000e+00 : f32
11
+ %c256_i32 = arith.constant 256 : i32
12
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
13
+ %cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
14
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
17
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
18
+ %3 = arith.muli %0, %c256_i32 : i32
19
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
20
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
21
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
22
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
24
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
25
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
26
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
27
+ %12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
28
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
29
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
31
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
32
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
33
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
34
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
35
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
36
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
37
+ %22 = tt.addptr %arg6, %0 : !tt.ptr<i64, 1>, i32
38
+ %23 = tt.splat %22 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
39
+ %24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
40
+ %25 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
41
+ %26 = tt.addptr %25, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
42
+ %27 = tt.load %26, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
43
+ %28 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
44
+ %29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
45
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg10: f32, %arg11: f32):
47
+ %63 = arith.addf %arg10, %arg11 : f32
48
+ tt.reduce.return %63 : f32
49
+ }) : (tensor<256xf32, #blocked>) -> f32
50
+ %31 = arith.addf %30, %cst_5 : f32
51
+ %32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
52
+ %33 = arith.subf %15, %32 : tensor<256xf32, #blocked>
53
+ %34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
54
+ %35 = arith.mulf %33, %34 : tensor<256xf32, #blocked>
55
+ %36 = arith.mulf %28, %35 : tensor<256xf32, #blocked>
56
+ %37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
57
+ %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({
58
+ ^bb0(%arg10: f32, %arg11: f32):
59
+ %63 = arith.addf %arg10, %arg11 : f32
60
+ tt.reduce.return %63 : f32
61
+ }) : (tensor<256xf32, #blocked>) -> f32
62
+ %39 = arith.addf %38, %cst_5 : f32
63
+ %40 = arith.cmpi eq, %24, %cst_0 : tensor<1xi64, #blocked>
64
+ %41 = arith.divf %21, %cst_1 : tensor<1xf32, #blocked>
65
+ %42 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked>
66
+ %43 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked>
67
+ %44 = arith.subf %42, %43 : tensor<256xf32, #blocked>
68
+ %45 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
69
+ %46 = arith.mulf %35, %45 : tensor<256xf32, #blocked>
70
+ %47 = arith.subf %44, %46 : tensor<256xf32, #blocked>
71
+ %48 = tt.broadcast %41 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
72
+ %49 = arith.mulf %48, %47 : tensor<256xf32, #blocked>
73
+ %50 = arith.addf %27, %49 : tensor<256xf32, #blocked>
74
+ %51 = tt.broadcast %40 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked>
75
+ %52 = arith.select %51, %cst_6, %50 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
76
+ %53 = arith.addi %24, %cst_4 : tensor<1xi64, #blocked>
77
+ %54 = arith.cmpi slt, %24, %cst_3 : tensor<1xi64, #blocked>
78
+ %55 = arith.select %54, %53, %24 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
79
+ %56 = arith.muli %55, %cst_2 : tensor<1xi64, #blocked>
80
+ %57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
81
+ %58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
82
+ %59 = arith.addi %58, %57 : tensor<256xi64, #blocked>
83
+ %60 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
84
+ %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
85
+ %62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked>
86
+ tt.return
87
+ }
88
+ }
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.llir ADDED
@@ -0,0 +1,980 @@
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = lshr i32 %8, 5, !dbg !10
18
+ %10 = and i32 %9, 7, !dbg !10
19
+ %11 = and i32 %8, 15, !dbg !10
20
+ %12 = shl i32 %8, 3, !dbg !11
21
+ %13 = and i32 %12, 248, !dbg !11
22
+ %14 = or i32 %13, 4, !dbg !11
23
+ %urem = and i32 %8, 255, !dbg !11
24
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
25
+ %16 = shl i32 %15, 4, !dbg !13
26
+ %17 = or i32 %16, %10, !dbg !14
27
+ %18 = or i32 %17, 8, !dbg !14
28
+ %19 = or i32 %16, %11, !dbg !14
29
+ %20 = sext i32 %17 to i64, !dbg !15
30
+ %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
31
+ %22 = sext i32 %18 to i64, !dbg !15
32
+ %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
33
+ %24 = sext i32 %19 to i64, !dbg !15
34
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
35
+ %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
36
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
37
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
38
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
39
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
40
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
41
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
42
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
43
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
44
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
45
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
46
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
47
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
48
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
+ %43 = srem i32 %17, 512, !dbg !17
+ %44 = srem i32 %18, 512, !dbg !17
+ %45 = shl nsw i32 %43, 8, !dbg !18
+ %46 = shl nsw i32 %44, 8, !dbg !18
+ %47 = or i32 %45, %13, !dbg !19
+ %48 = or i32 %45, %14, !dbg !19
+ %49 = or i32 %46, %13, !dbg !19
+ %50 = or i32 %46, %14, !dbg !19
+ %51 = sext i32 %47 to i64, !dbg !20
+ %52 = getelementptr float, ptr addrspace(1) %2, i64 %51, !dbg !20
+ %53 = sext i32 %48 to i64, !dbg !20
+ %54 = getelementptr float, ptr addrspace(1) %2, i64 %53, !dbg !20
+ %55 = sext i32 %49 to i64, !dbg !20
+ %56 = getelementptr float, ptr addrspace(1) %2, i64 %55, !dbg !20
+ %57 = sext i32 %50 to i64, !dbg !20
+ %58 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !20
+ %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+ %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21
+ %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21
+ %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21
+ %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21
+ %64 = bitcast i32 %60 to float, !dbg !21
+ %65 = bitcast i32 %61 to float, !dbg !21
+ %66 = bitcast i32 %62 to float, !dbg !21
+ %67 = bitcast i32 %63 to float, !dbg !21
+ %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+ %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !21
+ %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !21
+ %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !21
+ %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !21
+ %73 = bitcast i32 %69 to float, !dbg !21
+ %74 = bitcast i32 %70 to float, !dbg !21
+ %75 = bitcast i32 %71 to float, !dbg !21
+ %76 = bitcast i32 %72 to float, !dbg !21
+ %77 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+ %78 = extractvalue { i32, i32, i32, i32 } %77, 0, !dbg !21
+ %79 = extractvalue { i32, i32, i32, i32 } %77, 1, !dbg !21
+ %80 = extractvalue { i32, i32, i32, i32 } %77, 2, !dbg !21
+ %81 = extractvalue { i32, i32, i32, i32 } %77, 3, !dbg !21
+ %82 = bitcast i32 %78 to float, !dbg !21
+ %83 = bitcast i32 %79 to float, !dbg !21
+ %84 = bitcast i32 %80 to float, !dbg !21
+ %85 = bitcast i32 %81 to float, !dbg !21
+ %86 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+ %87 = extractvalue { i32, i32, i32, i32 } %86, 0, !dbg !21
+ %88 = extractvalue { i32, i32, i32, i32 } %86, 1, !dbg !21
+ %89 = extractvalue { i32, i32, i32, i32 } %86, 2, !dbg !21
+ %90 = extractvalue { i32, i32, i32, i32 } %86, 3, !dbg !21
+ %91 = bitcast i32 %87 to float, !dbg !21
+ %92 = bitcast i32 %88 to float, !dbg !21
+ %93 = bitcast i32 %89 to float, !dbg !21
+ %94 = bitcast i32 %90 to float, !dbg !21
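+ ; Python-style negative indexing: add 50257 to negative indices, then
+ ; device-assert the result lies in [0, 50257).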
+ %95 = add i64 %42, 50257, !dbg !22
+ %96 = icmp slt i64 %26, 0, !dbg !23
+ %97 = icmp slt i64 %34, 0, !dbg !23
+ %98 = icmp slt i64 %42, 0, !dbg !23
+ %99 = select i1 %98, i64 %95, i64 %42, !dbg !24
+ %100 = icmp ugt i64 %99, 50256, !dbg !25
+ br i1 %100, label %101, label %102, !dbg !26
+
+ 101: ; preds = %7
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
+ br label %102, !dbg !26
+
+ 102: ; preds = %101, %7
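+ ; Row base offset into the gathered table: idx*256, wrapped by +12865792
+ ; (50257*256) when the original index was negative.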
+ %103 = shl i64 %26, 8, !dbg !27
+ %104 = add i64 %103, 12865792, !dbg !27
+ %105 = select i1 %96, i64 %104, i64 %103, !dbg !27
+ %106 = shl i64 %34, 8, !dbg !27
+ %107 = add i64 %106, 12865792, !dbg !27
+ %108 = select i1 %97, i64 %107, i64 %106, !dbg !27
+ %109 = zext nneg i32 %13 to i64
+ %110 = zext nneg i32 %14 to i64
+ %111 = or i64 %105, %109, !dbg !28
+ %112 = or i64 %105, %110, !dbg !28
+ %113 = or i64 %108, %109, !dbg !28
+ %114 = or i64 %108, %110, !dbg !28
+ %115 = getelementptr float, ptr addrspace(1) %1, i64 %111, !dbg !29
+ %116 = getelementptr float, ptr addrspace(1) %1, i64 %112, !dbg !29
+ %117 = getelementptr float, ptr addrspace(1) %1, i64 %113, !dbg !29
+ %118 = getelementptr float, ptr addrspace(1) %1, i64 %114, !dbg !29
+ %119 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %115, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
+ %120 = extractvalue { i32, i32, i32, i32 } %119, 0, !dbg !30
+ %121 = extractvalue { i32, i32, i32, i32 } %119, 1, !dbg !30
+ %122 = extractvalue { i32, i32, i32, i32 } %119, 2, !dbg !30
+ %123 = extractvalue { i32, i32, i32, i32 } %119, 3, !dbg !30
+ %124 = bitcast i32 %120 to float, !dbg !30
+ %125 = bitcast i32 %121 to float, !dbg !30
+ %126 = bitcast i32 %122 to float, !dbg !30
+ %127 = bitcast i32 %123 to float, !dbg !30
+ %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %116, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
+ %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !30
+ %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !30
+ %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !30
+ %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !30
+ %133 = bitcast i32 %129 to float, !dbg !30
+ %134 = bitcast i32 %130 to float, !dbg !30
+ %135 = bitcast i32 %131 to float, !dbg !30
+ %136 = bitcast i32 %132 to float, !dbg !30
+ %137 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
+ %138 = extractvalue { i32, i32, i32, i32 } %137, 0, !dbg !30
+ %139 = extractvalue { i32, i32, i32, i32 } %137, 1, !dbg !30
+ %140 = extractvalue { i32, i32, i32, i32 } %137, 2, !dbg !30
+ %141 = extractvalue { i32, i32, i32, i32 } %137, 3, !dbg !30
+ %142 = bitcast i32 %138 to float, !dbg !30
+ %143 = bitcast i32 %139 to float, !dbg !30
+ %144 = bitcast i32 %140 to float, !dbg !30
+ %145 = bitcast i32 %141 to float, !dbg !30
+ %146 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
+ %147 = extractvalue { i32, i32, i32, i32 } %146, 0, !dbg !30
+ %148 = extractvalue { i32, i32, i32, i32 } %146, 1, !dbg !30
+ %149 = extractvalue { i32, i32, i32, i32 } %146, 2, !dbg !30
+ %150 = extractvalue { i32, i32, i32, i32 } %146, 3, !dbg !30
+ %151 = bitcast i32 %147 to float, !dbg !30
+ %152 = bitcast i32 %148 to float, !dbg !30
+ %153 = bitcast i32 %149 to float, !dbg !30
+ %154 = bitcast i32 %150 to float, !dbg !30
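+ ; Elementwise sum of the two gathered rows; the fadd-with-0.0 chains appear to
+ ; seed the per-thread Welford accumulators (mean and m2 start at zero).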
+ %155 = fadd float %64, %124, !dbg !31
+ %156 = fadd float %65, %125, !dbg !31
+ %157 = fadd float %66, %126, !dbg !31
+ %158 = fadd float %67, %127, !dbg !31
+ %159 = fadd float %73, %133, !dbg !31
+ %160 = fadd float %74, %134, !dbg !31
+ %161 = fadd float %75, %135, !dbg !31
+ %162 = fadd float %76, %136, !dbg !31
+ %163 = fadd float %82, %142, !dbg !31
+ %164 = fadd float %83, %143, !dbg !31
+ %165 = fadd float %84, %144, !dbg !31
+ %166 = fadd float %85, %145, !dbg !31
+ %167 = fadd float %91, %151, !dbg !31
+ %168 = fadd float %92, %152, !dbg !31
+ %169 = fadd float %93, %153, !dbg !31
+ %170 = fadd float %94, %154, !dbg !31
+ %171 = fadd float %155, 0.000000e+00, !dbg !32
+ %172 = fadd float %156, 0.000000e+00, !dbg !32
+ %173 = fadd float %157, 0.000000e+00, !dbg !32
+ %174 = fadd float %158, 0.000000e+00, !dbg !32
+ %175 = fadd float %159, 0.000000e+00, !dbg !32
+ %176 = fadd float %160, 0.000000e+00, !dbg !32
+ %177 = fadd float %161, 0.000000e+00, !dbg !32
+ %178 = fadd float %162, 0.000000e+00, !dbg !32
+ %179 = fadd float %163, 0.000000e+00, !dbg !32
+ %180 = fadd float %164, 0.000000e+00, !dbg !32
+ %181 = fadd float %165, 0.000000e+00, !dbg !32
+ %182 = fadd float %166, 0.000000e+00, !dbg !32
+ %183 = fadd float %167, 0.000000e+00, !dbg !32
+ %184 = fadd float %168, 0.000000e+00, !dbg !32
+ %185 = fadd float %169, 0.000000e+00, !dbg !32
+ %186 = fadd float %170, 0.000000e+00, !dbg !32
+ %187 = fsub float %155, %171, !dbg !36
+ %188 = fsub float %156, %172, !dbg !36
+ %189 = fsub float %157, %173, !dbg !36
+ %190 = fsub float %158, %174, !dbg !36
+ %191 = fsub float %159, %175, !dbg !36
+ %192 = fsub float %160, %176, !dbg !36
+ %193 = fsub float %161, %177, !dbg !36
+ %194 = fsub float %162, %178, !dbg !36
+ %195 = fsub float %163, %179, !dbg !36
+ %196 = fsub float %164, %180, !dbg !36
+ %197 = fsub float %165, %181, !dbg !36
+ %198 = fsub float %166, %182, !dbg !36
+ %199 = fsub float %167, %183, !dbg !36
+ %200 = fsub float %168, %184, !dbg !36
+ %201 = fsub float %169, %185, !dbg !36
+ %202 = fsub float %170, %186, !dbg !36
+ %203 = fmul float %155, %187, !dbg !37
+ %204 = fmul float %156, %188, !dbg !37
+ %205 = fmul float %157, %189, !dbg !37
+ %206 = fmul float %158, %190, !dbg !37
+ %207 = fmul float %159, %191, !dbg !37
+ %208 = fmul float %160, %192, !dbg !37
+ %209 = fmul float %161, %193, !dbg !37
+ %210 = fmul float %162, %194, !dbg !37
+ %211 = fmul float %163, %195, !dbg !37
+ %212 = fmul float %164, %196, !dbg !37
+ %213 = fmul float %165, %197, !dbg !37
+ %214 = fmul float %166, %198, !dbg !37
+ %215 = fmul float %167, %199, !dbg !37
+ %216 = fmul float %168, %200, !dbg !37
+ %217 = fmul float %169, %201, !dbg !37
+ %218 = fmul float %170, %202, !dbg !37
+ %219 = fadd float %203, 0.000000e+00, !dbg !38
+ %220 = fadd float %204, 0.000000e+00, !dbg !38
+ %221 = fadd float %205, 0.000000e+00, !dbg !38
+ %222 = fadd float %206, 0.000000e+00, !dbg !38
+ %223 = fadd float %207, 0.000000e+00, !dbg !38
+ %224 = fadd float %208, 0.000000e+00, !dbg !38
+ %225 = fadd float %209, 0.000000e+00, !dbg !38
+ %226 = fadd float %210, 0.000000e+00, !dbg !38
+ %227 = fadd float %211, 0.000000e+00, !dbg !38
+ %228 = fadd float %212, 0.000000e+00, !dbg !38
+ %229 = fadd float %213, 0.000000e+00, !dbg !38
+ %230 = fadd float %214, 0.000000e+00, !dbg !38
+ %231 = fadd float %215, 0.000000e+00, !dbg !38
+ %232 = fadd float %216, 0.000000e+00, !dbg !38
+ %233 = fadd float %217, 0.000000e+00, !dbg !38
+ %234 = fadd float %218, 0.000000e+00, !dbg !38
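+ ; Sequential Welford combine of the 8 per-thread elements (cf. the
+ ; triton_helpers.py frames in the debug info): delta = x - mean,
+ ; mean += delta/n, m2 += delta^2 * (n-1)/n.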
+ %235 = fsub float %172, %171, !dbg !39
+ %236 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
+ %237 = fmul float %236, %235, !dbg !44
+ %238 = fadd float %171, %237, !dbg !45
+ %239 = fadd float %219, %220, !dbg !46
+ %240 = fmul float %235, %235, !dbg !47
+ %241 = fmul float %236, %240, !dbg !48
+ %242 = fadd float %241, %239, !dbg !49
+ %243 = fsub float %173, %238, !dbg !39
+ %244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
+ %245 = fmul float %244, %243, !dbg !44
+ %246 = fadd float %238, %245, !dbg !45
+ %247 = fadd float %221, %242, !dbg !46
+ %248 = fmul float %243, %243, !dbg !47
+ %249 = fmul float %248, 2.000000e+00, !dbg !50
+ %250 = fmul float %244, %249, !dbg !48
+ %251 = fadd float %247, %250, !dbg !49
+ %252 = fsub float %174, %246, !dbg !39
+ %253 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
+ %254 = fmul float %253, %252, !dbg !44
+ %255 = fadd float %246, %254, !dbg !45
+ %256 = fadd float %222, %251, !dbg !46
+ %257 = fmul float %252, %252, !dbg !47
+ %258 = fmul float %257, 3.000000e+00, !dbg !50
+ %259 = fmul float %253, %258, !dbg !48
+ %260 = fadd float %256, %259, !dbg !49
+ %261 = fsub float %175, %255, !dbg !39
+ %262 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !43
+ %263 = fmul float %262, %261, !dbg !44
+ %264 = fadd float %255, %263, !dbg !45
+ %265 = fadd float %223, %260, !dbg !46
+ %266 = fmul float %261, %261, !dbg !47
+ %267 = fmul float %266, 4.000000e+00, !dbg !50
+ %268 = fmul float %262, %267, !dbg !48
+ %269 = fadd float %265, %268, !dbg !49
+ %270 = fsub float %176, %264, !dbg !39
+ %271 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !43
+ %272 = fmul float %271, %270, !dbg !44
+ %273 = fadd float %264, %272, !dbg !45
+ %274 = fadd float %224, %269, !dbg !46
+ %275 = fmul float %270, %270, !dbg !47
+ %276 = fmul float %275, 5.000000e+00, !dbg !50
+ %277 = fmul float %271, %276, !dbg !48
+ %278 = fadd float %274, %277, !dbg !49
+ %279 = fsub float %177, %273, !dbg !39
+ %280 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !43
+ %281 = fmul float %280, %279, !dbg !44
+ %282 = fadd float %273, %281, !dbg !45
+ %283 = fadd float %225, %278, !dbg !46
+ %284 = fmul float %279, %279, !dbg !47
+ %285 = fmul float %284, 6.000000e+00, !dbg !50
+ %286 = fmul float %280, %285, !dbg !48
+ %287 = fadd float %283, %286, !dbg !49
+ %288 = fsub float %178, %282, !dbg !39
+ %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !43
+ %290 = fmul float %289, %288, !dbg !44
+ %291 = fadd float %282, %290, !dbg !45
+ %292 = fadd float %226, %287, !dbg !46
+ %293 = fmul float %288, %288, !dbg !47
+ %294 = fmul float %293, 7.000000e+00, !dbg !50
+ %295 = fmul float %289, %294, !dbg !48
+ %296 = fadd float %292, %295, !dbg !49
+ %297 = fsub float %180, %179, !dbg !39
+ %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
+ %299 = fmul float %297, %298, !dbg !44
+ %300 = fadd float %179, %299, !dbg !45
+ %301 = fadd float %227, %228, !dbg !46
+ %302 = fmul float %297, %297, !dbg !47
+ %303 = fmul float %302, %298, !dbg !48
+ %304 = fadd float %301, %303, !dbg !49
+ %305 = fsub float %181, %300, !dbg !39
+ %306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
+ %307 = fmul float %306, %305, !dbg !44
+ %308 = fadd float %300, %307, !dbg !45
+ %309 = fadd float %229, %304, !dbg !46
+ %310 = fmul float %305, %305, !dbg !47
+ %311 = fmul float %310, 2.000000e+00, !dbg !50
+ %312 = fmul float %306, %311, !dbg !48
+ %313 = fadd float %309, %312, !dbg !49
+ %314 = fsub float %182, %308, !dbg !39
+ %315 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
+ %316 = fmul float %315, %314, !dbg !44
+ %317 = fadd float %308, %316, !dbg !45
+ %318 = fadd float %230, %313, !dbg !46
+ %319 = fmul float %314, %314, !dbg !47
+ %320 = fmul float %319, 3.000000e+00, !dbg !50
+ %321 = fmul float %315, %320, !dbg !48
+ %322 = fadd float %318, %321, !dbg !49
+ %323 = fsub float %183, %317, !dbg !39
+ %324 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !43
+ %325 = fmul float %324, %323, !dbg !44
+ %326 = fadd float %317, %325, !dbg !45
+ %327 = fadd float %231, %322, !dbg !46
+ %328 = fmul float %323, %323, !dbg !47
+ %329 = fmul float %328, 4.000000e+00, !dbg !50
+ %330 = fmul float %324, %329, !dbg !48
+ %331 = fadd float %327, %330, !dbg !49
+ %332 = fsub float %184, %326, !dbg !39
+ %333 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !43
+ %334 = fmul float %333, %332, !dbg !44
+ %335 = fadd float %326, %334, !dbg !45
+ %336 = fadd float %232, %331, !dbg !46
+ %337 = fmul float %332, %332, !dbg !47
+ %338 = fmul float %337, 5.000000e+00, !dbg !50
+ %339 = fmul float %333, %338, !dbg !48
+ %340 = fadd float %336, %339, !dbg !49
+ %341 = fsub float %185, %335, !dbg !39
+ %342 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !43
+ %343 = fmul float %342, %341, !dbg !44
+ %344 = fadd float %335, %343, !dbg !45
+ %345 = fadd float %233, %340, !dbg !46
+ %346 = fmul float %341, %341, !dbg !47
+ %347 = fmul float %346, 6.000000e+00, !dbg !50
+ %348 = fmul float %342, %347, !dbg !48
+ %349 = fadd float %345, %348, !dbg !49
+ %350 = fsub float %186, %344, !dbg !39
+ %351 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !43
+ %352 = fmul float %351, %350, !dbg !44
+ %353 = fadd float %344, %352, !dbg !45
+ %354 = fadd float %234, %349, !dbg !46
+ %355 = fmul float %350, %350, !dbg !47
+ %356 = fmul float %355, 7.000000e+00, !dbg !50
+ %357 = fmul float %351, %356, !dbg !48
+ %358 = fadd float %354, %357, !dbg !49
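+ ; Cross-lane reduction via butterfly shuffles (offsets 16, 8, 4, 2, 1) merging
+ ; partial Welford states; 1090519040 is the bit pattern of 8.0f, each lane's
+ ; initial element count.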
+ %359 = bitcast float %291 to i32, !dbg !51
+ %360 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 16, i32 31), !dbg !51
+ %361 = bitcast i32 %360 to float, !dbg !51
+ %362 = bitcast float %296 to i32, !dbg !51
+ %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %362, i32 16, i32 31), !dbg !51
+ %364 = bitcast i32 %363 to float, !dbg !51
+ %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !51
+ %366 = bitcast i32 %365 to float, !dbg !51
+ %367 = fsub float %361, %291, !dbg !39
+ %368 = fadd float %366, 8.000000e+00, !dbg !53
+ %369 = fcmp oeq float %368, 0.000000e+00, !dbg !54
+ %370 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %366, float %368) #6, !dbg !43
+ %371 = select i1 %369, float 0.000000e+00, float %370, !dbg !55
+ %372 = fmul float %371, %367, !dbg !44
+ %373 = fadd float %291, %372, !dbg !45
+ %374 = fadd float %296, %364, !dbg !46
+ %375 = fmul float %367, %367, !dbg !47
+ %376 = fmul float %375, 8.000000e+00, !dbg !50
+ %377 = fmul float %371, %376, !dbg !48
+ %378 = fadd float %374, %377, !dbg !49
+ %379 = bitcast float %373 to i32, !dbg !51
+ %380 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 8, i32 31), !dbg !51
+ %381 = bitcast i32 %380 to float, !dbg !51
+ %382 = bitcast float %378 to i32, !dbg !51
+ %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 8, i32 31), !dbg !51
+ %384 = bitcast i32 %383 to float, !dbg !51
+ %385 = bitcast float %368 to i32, !dbg !51
+ %386 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 8, i32 31), !dbg !51
+ %387 = bitcast i32 %386 to float, !dbg !51
+ %388 = fsub float %381, %373, !dbg !39
+ %389 = fadd float %368, %387, !dbg !53
+ %390 = fcmp oeq float %389, 0.000000e+00, !dbg !54
+ %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %387, float %389) #6, !dbg !43
+ %392 = select i1 %390, float 0.000000e+00, float %391, !dbg !55
+ %393 = fmul float %392, %388, !dbg !44
+ %394 = fadd float %373, %393, !dbg !45
+ %395 = fadd float %378, %384, !dbg !46
+ %396 = fmul float %388, %388, !dbg !47
+ %397 = fmul float %368, %396, !dbg !50
+ %398 = fmul float %392, %397, !dbg !48
+ %399 = fadd float %395, %398, !dbg !49
+ %400 = bitcast float %394 to i32, !dbg !51
+ %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 4, i32 31), !dbg !51
+ %402 = bitcast i32 %401 to float, !dbg !51
+ %403 = bitcast float %399 to i32, !dbg !51
+ %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 4, i32 31), !dbg !51
+ %405 = bitcast i32 %404 to float, !dbg !51
+ %406 = bitcast float %389 to i32, !dbg !51
+ %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 4, i32 31), !dbg !51
+ %408 = bitcast i32 %407 to float, !dbg !51
+ %409 = fsub float %402, %394, !dbg !39
+ %410 = fadd float %389, %408, !dbg !53
+ %411 = fcmp oeq float %410, 0.000000e+00, !dbg !54
+ %412 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %408, float %410) #6, !dbg !43
+ %413 = select i1 %411, float 0.000000e+00, float %412, !dbg !55
+ %414 = fmul float %409, %413, !dbg !44
+ %415 = fadd float %394, %414, !dbg !45
+ %416 = fadd float %399, %405, !dbg !46
+ %417 = fmul float %409, %409, !dbg !47
+ %418 = fmul float %389, %417, !dbg !50
+ %419 = fmul float %413, %418, !dbg !48
+ %420 = fadd float %416, %419, !dbg !49
+ %421 = bitcast float %415 to i32, !dbg !51
+ %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 2, i32 31), !dbg !51
+ %423 = bitcast i32 %422 to float, !dbg !51
+ %424 = bitcast float %420 to i32, !dbg !51
+ %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 2, i32 31), !dbg !51
+ %426 = bitcast i32 %425 to float, !dbg !51
+ %427 = bitcast float %410 to i32, !dbg !51
+ %428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %427, i32 2, i32 31), !dbg !51
+ %429 = bitcast i32 %428 to float, !dbg !51
+ %430 = fsub float %423, %415, !dbg !39
+ %431 = fadd float %410, %429, !dbg !53
+ %432 = fcmp oeq float %431, 0.000000e+00, !dbg !54
+ %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %429, float %431) #6, !dbg !43
+ %434 = select i1 %432, float 0.000000e+00, float %433, !dbg !55
+ %435 = fmul float %430, %434, !dbg !44
+ %436 = fadd float %415, %435, !dbg !45
+ %437 = fadd float %420, %426, !dbg !46
+ %438 = fmul float %430, %430, !dbg !47
+ %439 = fmul float %410, %438, !dbg !50
+ %440 = fmul float %434, %439, !dbg !48
+ %441 = fadd float %437, %440, !dbg !49
+ %442 = bitcast float %436 to i32, !dbg !51
+ %443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %442, i32 1, i32 31), !dbg !51
+ %444 = bitcast float %441 to i32, !dbg !51
+ %445 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %444, i32 1, i32 31), !dbg !51
+ %446 = bitcast float %431 to i32, !dbg !51
+ %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 1, i32 31), !dbg !51
+ %448 = bitcast i32 %447 to float, !dbg !51
+ %449 = fadd float %431, %448, !dbg !53
+ %450 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %448, float %449) #6, !dbg !43
+ %451 = bitcast float %353 to i32, !dbg !51
+ %452 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %451, i32 16, i32 31), !dbg !51
+ %453 = bitcast i32 %452 to float, !dbg !51
+ %454 = bitcast float %358 to i32, !dbg !51
+ %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 16, i32 31), !dbg !51
+ %456 = bitcast i32 %455 to float, !dbg !51
+ %457 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !51
+ %458 = bitcast i32 %457 to float, !dbg !51
+ %459 = fsub float %453, %353, !dbg !39
+ %460 = fadd float %458, 8.000000e+00, !dbg !53
+ %461 = fcmp oeq float %460, 0.000000e+00, !dbg !54
+ %462 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %458, float %460) #6, !dbg !43
+ %463 = select i1 %461, float 0.000000e+00, float %462, !dbg !55
+ %464 = fmul float %459, %463, !dbg !44
+ %465 = fadd float %353, %464, !dbg !45
+ %466 = fadd float %358, %456, !dbg !46
+ %467 = fmul float %459, %459, !dbg !47
+ %468 = fmul float %467, 8.000000e+00, !dbg !50
+ %469 = fmul float %468, %463, !dbg !48
+ %470 = fadd float %466, %469, !dbg !49
+ %471 = bitcast float %465 to i32, !dbg !51
+ %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 8, i32 31), !dbg !51
+ %473 = bitcast i32 %472 to float, !dbg !51
+ %474 = bitcast float %470 to i32, !dbg !51
+ %475 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %474, i32 8, i32 31), !dbg !51
+ %476 = bitcast i32 %475 to float, !dbg !51
+ %477 = bitcast float %460 to i32, !dbg !51
+ %478 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %477, i32 8, i32 31), !dbg !51
+ %479 = bitcast i32 %478 to float, !dbg !51
+ %480 = fsub float %473, %465, !dbg !39
+ %481 = fadd float %460, %479, !dbg !53
+ %482 = fcmp oeq float %481, 0.000000e+00, !dbg !54
+ %483 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %479, float %481) #6, !dbg !43
+ %484 = select i1 %482, float 0.000000e+00, float %483, !dbg !55
+ %485 = fmul float %480, %484, !dbg !44
+ %486 = fadd float %465, %485, !dbg !45
+ %487 = fadd float %470, %476, !dbg !46
+ %488 = fmul float %480, %480, !dbg !47
+ %489 = fmul float %460, %488, !dbg !50
+ %490 = fmul float %484, %489, !dbg !48
+ %491 = fadd float %487, %490, !dbg !49
+ %492 = bitcast float %486 to i32, !dbg !51
+ %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 4, i32 31), !dbg !51
+ %494 = bitcast i32 %493 to float, !dbg !51
+ %495 = bitcast float %491 to i32, !dbg !51
+ %496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 4, i32 31), !dbg !51
+ %497 = bitcast i32 %496 to float, !dbg !51
+ %498 = bitcast float %481 to i32, !dbg !51
+ %499 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 4, i32 31), !dbg !51
+ %500 = bitcast i32 %499 to float, !dbg !51
+ %501 = fsub float %494, %486, !dbg !39
+ %502 = fadd float %481, %500, !dbg !53
+ %503 = fcmp oeq float %502, 0.000000e+00, !dbg !54
+ %504 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %500, float %502) #6, !dbg !43
+ %505 = select i1 %503, float 0.000000e+00, float %504, !dbg !55
+ %506 = fmul float %501, %505, !dbg !44
+ %507 = fadd float %486, %506, !dbg !45
+ %508 = fadd float %491, %497, !dbg !46
+ %509 = fmul float %501, %501, !dbg !47
+ %510 = fmul float %481, %509, !dbg !50
+ %511 = fmul float %505, %510, !dbg !48
+ %512 = fadd float %508, %511, !dbg !49
+ %513 = bitcast float %507 to i32, !dbg !51
+ %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 2, i32 31), !dbg !51
+ %515 = bitcast i32 %514 to float, !dbg !51
+ %516 = bitcast float %512 to i32, !dbg !51
+ %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 2, i32 31), !dbg !51
+ %518 = bitcast i32 %517 to float, !dbg !51
+ %519 = bitcast float %502 to i32, !dbg !51
+ %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %519, i32 2, i32 31), !dbg !51
+ %521 = bitcast i32 %520 to float, !dbg !51
+ %522 = fsub float %515, %507, !dbg !39
+ %523 = fadd float %502, %521, !dbg !53
+ %524 = fcmp oeq float %523, 0.000000e+00, !dbg !54
+ %525 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %521, float %523) #6, !dbg !43
+ %526 = select i1 %524, float 0.000000e+00, float %525, !dbg !55
+ %527 = fmul float %522, %526, !dbg !44
+ %528 = fadd float %507, %527, !dbg !45
+ %529 = fadd float %512, %518, !dbg !46
+ %530 = fmul float %522, %522, !dbg !47
+ %531 = fmul float %502, %530, !dbg !50
+ %532 = fmul float %526, %531, !dbg !48
+ %533 = fadd float %529, %532, !dbg !49
+ %534 = bitcast float %528 to i32, !dbg !51
+ %535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %534, i32 1, i32 31), !dbg !51
+ %536 = bitcast float %533 to i32, !dbg !51
+ %537 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %536, i32 1, i32 31), !dbg !51
+ %538 = bitcast float %523 to i32, !dbg !51
+ %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 1, i32 31), !dbg !51
+ %540 = bitcast i32 %539 to float, !dbg !51
+ %541 = fadd float %523, %540, !dbg !53
+ %542 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %540, float %541) #6, !dbg !43
+ %543 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
+ %544 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
+ %545 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
+ %546 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
+ %547 = zext nneg i32 %urem to i64, !dbg !57
+ %548 = getelementptr float, ptr addrspace(1) %3, i64 %547, !dbg !57
+ %549 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %548, i1 true, i32 0, i1 true) #6, !dbg !58
+ br i1 %100, label %550, label %551, !dbg !59
+
+ 550: ; preds = %102
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
+ br label %551, !dbg !59
+
+ 551: ; preds = %550, %102
+ %552 = bitcast i32 %537 to float, !dbg !51
+ %553 = fadd float %533, %552, !dbg !46
+ %554 = bitcast i32 %535 to float, !dbg !51
+ %555 = fsub float %554, %528, !dbg !39
+ %556 = fmul float %555, %555, !dbg !47
+ %557 = fmul float %523, %556, !dbg !50
+ %558 = fcmp oeq float %541, 0.000000e+00, !dbg !54
+ %559 = select i1 %558, float 0.000000e+00, float %542, !dbg !55
+ %560 = fmul float %559, %557, !dbg !48
+ %561 = fadd float %553, %560, !dbg !49
+ %562 = bitcast i32 %445 to float, !dbg !51
+ %563 = fadd float %441, %562, !dbg !46
+ %564 = bitcast i32 %443 to float, !dbg !51
+ %565 = fsub float %564, %436, !dbg !39
+ %566 = fmul float %565, %565, !dbg !47
+ %567 = fmul float %431, %566, !dbg !50
+ %568 = fcmp oeq float %449, 0.000000e+00, !dbg !54
+ %569 = select i1 %568, float 0.000000e+00, float %450, !dbg !55
+ %570 = fmul float %569, %567, !dbg !48
+ %571 = fadd float %563, %570, !dbg !49
+ %572 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %115, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
+ %573 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %116, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
+ %574 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
+ %575 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
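+ ; Divide the merged m2 by 256.0 (the row length) to obtain the variance; the
+ ; duplicated div.full calls look like redundant per-element copies.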
+ %576 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %577 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %578 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %579 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %581 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %582 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %583 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
+ %584 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %585 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %586 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %588 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %589 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %590 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %591 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
+ %592 = fadd float %576, 0x3EE4F8B580000000, !dbg !62
+ %593 = fadd float %584, 0x3EE4F8B580000000, !dbg !62
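+ ; rsqrt(var + eps) with eps = 0x3EE4F8B580000000 (about 1.0e-5);
+ ; __nvvm_reflect("__CUDA_FTZ") selects the FTZ vs. non-FTZ rsqrt.approx variant.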
+ %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %.not.i = icmp eq i32 %594, 0, !dbg !63
+ br i1 %.not.i, label %597, label %595, !dbg !63
+
+ 595: ; preds = %551
+ %596 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !63
+ br label %__nv_rsqrtf.exit, !dbg !63
+
+ 597: ; preds = %551
+ %598 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !63
+ br label %__nv_rsqrtf.exit, !dbg !63
+
+ __nv_rsqrtf.exit: ; preds = %595, %597
+ %.0.i = phi float [ %596, %595 ], [ %598, %597 ], !dbg !63
+ %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %605 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %.not.i22 = icmp eq i32 %606, 0, !dbg !63
+ br i1 %.not.i22, label %609, label %607, !dbg !63
+
+ 607: ; preds = %__nv_rsqrtf.exit
+ %608 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !63
+ br label %__nv_rsqrtf.exit24, !dbg !63
+
+ 609: ; preds = %__nv_rsqrtf.exit
+ %610 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !63
+ br label %__nv_rsqrtf.exit24, !dbg !63
+
+ __nv_rsqrtf.exit24: ; preds = %607, %609
+ %.0.i23 = phi float [ %608, %607 ], [ %610, %609 ], !dbg !63
+ %611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+ %618 = extractvalue { i32, i32, i32, i32 } %575, 3, !dbg !60
+ %619 = bitcast i32 %618 to float, !dbg !60
+ %620 = extractvalue { i32, i32, i32, i32 } %546, 3, !dbg !56
+ %621 = bitcast i32 %620 to float, !dbg !56
+ %622 = fadd float %621, %619, !dbg !64
+ %623 = fmul float %555, %559, !dbg !44
+ %624 = fadd float %528, %623, !dbg !45
+ %625 = fsub float %622, %624, !dbg !65
+ %626 = extractvalue { i32, i32, i32, i32 } %575, 2, !dbg !60
+ %627 = bitcast i32 %626 to float, !dbg !60
+ %628 = extractvalue { i32, i32, i32, i32 } %546, 2, !dbg !56
+ %629 = bitcast i32 %628 to float, !dbg !56
+ %630 = fadd float %629, %627, !dbg !64
+ %631 = fsub float %630, %624, !dbg !65
+ %632 = extractvalue { i32, i32, i32, i32 } %575, 1, !dbg !60
+ %633 = bitcast i32 %632 to float, !dbg !60
+ %634 = extractvalue { i32, i32, i32, i32 } %546, 1, !dbg !56
+ %635 = bitcast i32 %634 to float, !dbg !56
+ %636 = fadd float %635, %633, !dbg !64
+ %637 = fsub float %636, %624, !dbg !65
+ %638 = extractvalue { i32, i32, i32, i32 } %575, 0, !dbg !60
+ %639 = bitcast i32 %638 to float, !dbg !60
+ %640 = extractvalue { i32, i32, i32, i32 } %546, 0, !dbg !56
+ %641 = bitcast i32 %640 to float, !dbg !56
+ %642 = fadd float %641, %639, !dbg !64
+ %643 = fsub float %642, %624, !dbg !65
+ %644 = extractvalue { i32, i32, i32, i32 } %574, 3, !dbg !60
+ %645 = bitcast i32 %644 to float, !dbg !60
+ %646 = extractvalue { i32, i32, i32, i32 } %545, 3, !dbg !56
+ %647 = bitcast i32 %646 to float, !dbg !56
+ %648 = fadd float %647, %645, !dbg !64
+ %649 = fsub float %648, %624, !dbg !65
+ %650 = extractvalue { i32, i32, i32, i32 } %574, 2, !dbg !60
+ %651 = bitcast i32 %650 to float, !dbg !60
+ %652 = extractvalue { i32, i32, i32, i32 } %545, 2, !dbg !56
+ %653 = bitcast i32 %652 to float, !dbg !56
+ %654 = fadd float %653, %651, !dbg !64
+ %655 = fsub float %654, %624, !dbg !65
+ %656 = extractvalue { i32, i32, i32, i32 } %574, 1, !dbg !60
+ %657 = bitcast i32 %656 to float, !dbg !60
+ %658 = extractvalue { i32, i32, i32, i32 } %545, 1, !dbg !56
+ %659 = bitcast i32 %658 to float, !dbg !56
+ %660 = fadd float %659, %657, !dbg !64
+ %661 = fsub float %660, %624, !dbg !65
+ %662 = extractvalue { i32, i32, i32, i32 } %574, 0, !dbg !60
+ %663 = bitcast i32 %662 to float, !dbg !60
+ %664 = extractvalue { i32, i32, i32, i32 } %545, 0, !dbg !56
+ %665 = bitcast i32 %664 to float, !dbg !56
+ %666 = fadd float %665, %663, !dbg !64
+ %667 = fsub float %666, %624, !dbg !65
+ %668 = extractvalue { i32, i32, i32, i32 } %573, 3, !dbg !60
+ %669 = bitcast i32 %668 to float, !dbg !60
+ %670 = extractvalue { i32, i32, i32, i32 } %544, 3, !dbg !56
+ %671 = bitcast i32 %670 to float, !dbg !56
+ %672 = fadd float %671, %669, !dbg !64
+ %673 = fmul float %565, %569, !dbg !44
+ %674 = fadd float %436, %673, !dbg !45
+ %675 = fsub float %672, %674, !dbg !65
+ %676 = extractvalue { i32, i32, i32, i32 } %573, 2, !dbg !60
+ %677 = bitcast i32 %676 to float, !dbg !60
+ %678 = extractvalue { i32, i32, i32, i32 } %544, 2, !dbg !56
+ %679 = bitcast i32 %678 to float, !dbg !56
+ %680 = fadd float %679, %677, !dbg !64
+ %681 = fsub float %680, %674, !dbg !65
+ %682 = extractvalue { i32, i32, i32, i32 } %573, 1, !dbg !60
+ %683 = bitcast i32 %682 to float, !dbg !60
+ %684 = extractvalue { i32, i32, i32, i32 } %544, 1, !dbg !56
+ %685 = bitcast i32 %684 to float, !dbg !56
+ %686 = fadd float %685, %683, !dbg !64
+ %687 = fsub float %686, %674, !dbg !65
+ %688 = extractvalue { i32, i32, i32, i32 } %573, 0, !dbg !60
+ %689 = bitcast i32 %688 to float, !dbg !60
+ %690 = extractvalue { i32, i32, i32, i32 } %544, 0, !dbg !56
+ %691 = bitcast i32 %690 to float, !dbg !56
+ %692 = fadd float %691, %689, !dbg !64
+ %693 = fsub float %692, %674, !dbg !65
+ %694 = extractvalue { i32, i32, i32, i32 } %572, 3, !dbg !60
+ %695 = bitcast i32 %694 to float, !dbg !60
+ %696 = extractvalue { i32, i32, i32, i32 } %543, 3, !dbg !56
+ %697 = bitcast i32 %696 to float, !dbg !56
+ %698 = fadd float %697, %695, !dbg !64
+ %699 = fsub float %698, %674, !dbg !65
+ %700 = extractvalue { i32, i32, i32, i32 } %572, 2, !dbg !60
+ %701 = bitcast i32 %700 to float, !dbg !60
+ %702 = extractvalue { i32, i32, i32, i32 } %543, 2, !dbg !56
+ %703 = bitcast i32 %702 to float, !dbg !56
+ %704 = fadd float %703, %701, !dbg !64
+ %705 = fsub float %704, %674, !dbg !65
+ %706 = extractvalue { i32, i32, i32, i32 } %572, 1, !dbg !60
+ %707 = bitcast i32 %706 to float, !dbg !60
+ %708 = extractvalue { i32, i32, i32, i32 } %543, 1, !dbg !56
+ %709 = bitcast i32 %708 to float, !dbg !56
+ %710 = fadd float %709, %707, !dbg !64
+ %711 = fsub float %710, %674, !dbg !65
+ %712 = extractvalue { i32, i32, i32, i32 } %572, 0, !dbg !60
+ %713 = bitcast i32 %712 to float, !dbg !60
+ %714 = extractvalue { i32, i32, i32, i32 } %543, 0, !dbg !56
+ %715 = bitcast i32 %714 to float, !dbg !56
+ %716 = fadd float %715, %713, !dbg !64
+ %717 = fsub float %716, %674, !dbg !65
+ %718 = fmul float %717, %.0.i, !dbg !66
+ %719 = fmul float %711, %.0.i, !dbg !66
+ %720 = fmul float %705, %.0.i, !dbg !66
+ %721 = fmul float %699, %.0.i, !dbg !66
+ %722 = fmul float %693, %.0.i, !dbg !66
+ %723 = fmul float %687, %.0.i, !dbg !66
+ %724 = fmul float %681, %.0.i, !dbg !66
+ %725 = fmul float %675, %.0.i, !dbg !66
+ %726 = fmul float %667, %.0.i23, !dbg !66
+ %727 = fmul float %661, %.0.i23, !dbg !66
+ %728 = fmul float %655, %.0.i23, !dbg !66
+ %729 = fmul float %649, %.0.i23, !dbg !66
+ %730 = fmul float %643, %.0.i23, !dbg !66
+ %731 = fmul float %637, %.0.i23, !dbg !66
+ %732 = fmul float %631, %.0.i23, !dbg !66
+ %733 = fmul float %625, %.0.i23, !dbg !66
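+ ; Stage the 256-element scale vector (arg %3) through shared memory and barrier,
+ ; so each thread can read the 8 weights covering its column slice.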
+ %734 = getelementptr float, ptr addrspace(3) @global_smem, i64 %547, !dbg !67
+ store i32 %549, ptr addrspace(3) %734, align 4, !dbg !67
+ tail call void @llvm.nvvm.barrier0(), !dbg !67
+ %735 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !67
+ %736 = load float, ptr addrspace(3) %735, align 32, !dbg !67
+ %737 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 1, !dbg !67
+ %738 = load float, ptr addrspace(3) %737, align 4, !dbg !67
+ %739 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 2, !dbg !67
+ %740 = load float, ptr addrspace(3) %739, align 8, !dbg !67
+ %741 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 3, !dbg !67
+ %742 = load float, ptr addrspace(3) %741, align 4, !dbg !67
+ %743 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 4, !dbg !67
+ %744 = load float, ptr addrspace(3) %743, align 16, !dbg !67
+ %745 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 5, !dbg !67
+ %746 = load float, ptr addrspace(3) %745, align 4, !dbg !67
+ %747 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 6, !dbg !67
+ %748 = load float, ptr addrspace(3) %747, align 8, !dbg !67
+ %749 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 7, !dbg !67
+ %750 = load float, ptr addrspace(3) %749, align 4, !dbg !67
+ %751 = fmul float %718, %736, !dbg !67
+ %752 = fmul float %719, %738, !dbg !67
+ %753 = fmul float %720, %740, !dbg !67
+ %754 = fmul float %721, %742, !dbg !67
+ %755 = fmul float %722, %744, !dbg !67
+ %756 = fmul float %723, %746, !dbg !67
+ %757 = fmul float %724, %748, !dbg !67
+ %758 = fmul float %725, %750, !dbg !67
+ %759 = fmul float %726, %736, !dbg !67
+ %760 = fmul float %727, %738, !dbg !67
+ %761 = fmul float %728, %740, !dbg !67
+ %762 = fmul float %729, %742, !dbg !67
+ %763 = fmul float %730, %744, !dbg !67
+ %764 = fmul float %731, %746, !dbg !67
+ %765 = fmul float %732, %748, !dbg !67
+ %766 = fmul float %733, %750, !dbg !67
+ %767 = shl i32 %17, 8, !dbg !68
+ %768 = shl i32 %18, 8, !dbg !68
+ %769 = or i32 %767, %13, !dbg !69
+ %770 = or i32 %768, %13, !dbg !69
+ %771 = sext i32 %769 to i64, !dbg !70
+ %772 = getelementptr i16, ptr addrspace(1) %4, i64 %771, !dbg !70
+ %773 = sext i32 %770 to i64, !dbg !70
+ %774 = getelementptr i16, ptr addrspace(1) %4, i64 %773, !dbg !70
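+ ; Convert the normalized, scaled values to bf16 (cvt.rn.bf16.f32), pack pairs
+ ; into b32 words, and store each 8-element row slice with one st.global.v4.b32.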
+ %775 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %751) #6, !dbg !71
+ %776 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %752) #6, !dbg !71
+ %777 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %753) #6, !dbg !71
+ %778 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %754) #6, !dbg !71
+ %779 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %755) #6, !dbg !71
+ %780 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %756) #6, !dbg !71
+ %781 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %757) #6, !dbg !71
+ %782 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %758) #6, !dbg !71
+ %783 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %759) #6, !dbg !71
+ %784 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %760) #6, !dbg !71
+ %785 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %761) #6, !dbg !71
+ %786 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %762) #6, !dbg !71
+ %787 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %763) #6, !dbg !71
+ %788 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %764) #6, !dbg !71
+ %789 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %765) #6, !dbg !71
+ %790 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %766) #6, !dbg !71
+ %791 = insertelement <2 x i16> undef, i16 %775, i64 0, !dbg !71
+ %792 = insertelement <2 x i16> %791, i16 %776, i64 1, !dbg !71
+ %793 = bitcast <2 x i16> %792 to i32, !dbg !71
+ %794 = insertelement <2 x i16> undef, i16 %777, i64 0, !dbg !71
+ %795 = insertelement <2 x i16> %794, i16 %778, i64 1, !dbg !71
+ %796 = bitcast <2 x i16> %795 to i32, !dbg !71
+ %797 = insertelement <2 x i16> undef, i16 %779, i64 0, !dbg !71
+ %798 = insertelement <2 x i16> %797, i16 %780, i64 1, !dbg !71
+ %799 = bitcast <2 x i16> %798 to i32, !dbg !71
+ %800 = insertelement <2 x i16> undef, i16 %781, i64 0, !dbg !71
+ %801 = insertelement <2 x i16> %800, i16 %782, i64 1, !dbg !71
+ %802 = bitcast <2 x i16> %801 to i32, !dbg !71
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %793, i32 %796, i32 %799, i32 %802, ptr addrspace(1) %772, i1 true) #6, !dbg !71
+ %803 = insertelement <2 x i16> undef, i16 %783, i64 0, !dbg !71
+ %804 = insertelement <2 x i16> %803, i16 %784, i64 1, !dbg !71
+ %805 = bitcast <2 x i16> %804 to i32, !dbg !71
+ %806 = insertelement <2 x i16> undef, i16 %785, i64 0, !dbg !71
+ %807 = insertelement <2 x i16> %806, i16 %786, i64 1, !dbg !71
+ %808 = bitcast <2 x i16> %807 to i32, !dbg !71
+ %809 = insertelement <2 x i16> undef, i16 %787, i64 0, !dbg !71
+ %810 = insertelement <2 x i16> %809, i16 %788, i64 1, !dbg !71
+ %811 = bitcast <2 x i16> %810 to i32, !dbg !71
+ %812 = insertelement <2 x i16> undef, i16 %789, i64 0, !dbg !71
+ %813 = insertelement <2 x i16> %812, i16 %790, i64 1, !dbg !71
+ %814 = bitcast <2 x i16> %813 to i32, !dbg !71
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %805, i32 %808, i32 %811, i32 %814, ptr addrspace(1) %774, i1 true) #6, !dbg !71
+ ret void, !dbg !72
+ }
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+ ; Function Attrs: convergent nocallback nounwind
+ declare void @llvm.nvvm.barrier0() #2
+
+ ; Function Attrs: alwaysinline nounwind
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+ %.not = icmp eq i32 %1, 0
+ br i1 %.not, label %4, label %2
+
+ 2: ; preds = %0
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+ br label %6
+
+ 4: ; preds = %0
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+ br label %6
+
+ 6: ; preds = %4, %2
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+ ret float %.0
+ }
+
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+ attributes #2 = { convergent nocallback nounwind }
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+ attributes #6 = { nounwind }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.dbg.cu = !{!2}
+ !nvvm.annotations = !{!4, !5, !5, !4}
+ !llvm.ident = !{!6}
+
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+ !3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh")
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+ !9 = !{}
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
921
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
922
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
923
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
924
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
925
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
926
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
927
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
928
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
929
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
930
+ !22 = !DILocation(line: 36, column: 22, scope: !7)
931
+ !23 = !DILocation(line: 37, column: 22, scope: !7)
932
+ !24 = !DILocation(line: 38, column: 36, scope: !7)
933
+ !25 = !DILocation(line: 39, column: 40, scope: !7)
934
+ !26 = !DILocation(line: 39, column: 55, scope: !7)
935
+ !27 = !DILocation(line: 40, column: 44, scope: !7)
936
+ !28 = !DILocation(line: 40, column: 40, scope: !7)
937
+ !29 = !DILocation(line: 40, column: 34, scope: !7)
938
+ !30 = !DILocation(line: 40, column: 52, scope: !7)
939
+ !31 = !DILocation(line: 41, column: 22, scope: !7)
940
+ !32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
941
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
942
+ !34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
943
+ !35 = !DILocation(line: 44, column: 38, scope: !33)
944
+ !36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
945
+ !37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
946
+ !38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
947
+ !39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
948
+ !40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
949
+ !41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
950
+ !42 = !DILocation(line: 50, column: 41, scope: !40)
951
+ !43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
952
+ !44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
953
+ !45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
954
+ !46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
955
+ !47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
956
+ !48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
957
+ !49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
958
+ !50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
959
+ !51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
960
+ !52 = !DILocation(line: 50, column: 41, scope: !33)
961
+ !53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
962
+ !54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
963
+ !55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
964
+ !56 = !DILocation(line: 59, column: 51, scope: !7)
965
+ !57 = !DILocation(line: 60, column: 35, scope: !7)
966
+ !58 = !DILocation(line: 60, column: 40, scope: !7)
967
+ !59 = !DILocation(line: 64, column: 57, scope: !7)
968
+ !60 = !DILocation(line: 65, column: 54, scope: !7)
969
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
970
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
971
+ !63 = !DILocation(line: 72, column: 30, scope: !7)
972
+ !64 = !DILocation(line: 66, column: 24, scope: !7)
973
+ !65 = !DILocation(line: 67, column: 24, scope: !7)
974
+ !66 = !DILocation(line: 73, column: 24, scope: !7)
975
+ !67 = !DILocation(line: 74, column: 24, scope: !7)
976
+ !68 = !DILocation(line: 76, column: 39, scope: !7)
977
+ !69 = !DILocation(line: 76, column: 35, scope: !7)
978
+ !70 = !DILocation(line: 76, column: 29, scope: !7)
979
+ !71 = !DILocation(line: 76, column: 52, scope: !7)
980
+ !72 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ptx ADDED
@@ -0,0 +1,1654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5de6de(
29
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
34
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
36
+ )
37
+ .maxntid 256, 1, 1
38
+ {
39
+ .reg .pred %p<117>;
40
+ .reg .b16 %rs<17>;
41
+ .reg .b32 %r<375>;
42
+ .reg .f32 %f<423>;
43
+ .reg .b64 %rd<113>;
44
+ .loc 1 18 0
45
+ $L__func_begin0:
46
+ .loc 1 18 0
47
+
48
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_3];
49
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_1];
50
+ ld.param.u64 %rd53, [triton__0d1d2d3d4d5de6de_param_0];
51
+ $L__tmp0:
52
+ .loc 1 22 44
53
+ mov.u32 %r59, %tid.x;
54
+ ld.param.u64 %rd54, [triton__0d1d2d3d4d5de6de_param_2];
55
+ bfe.u32 %r60, %r59, 5, 3;
56
+ and.b32 %r61, %r59, 15;
57
+ .loc 1 24 33
58
+ shl.b32 %r62, %r59, 3;
59
+ and.b32 %r1, %r62, 248;
60
+ and.b32 %r2, %r59, 255;
61
+ .loc 1 21 28
62
+ mov.u32 %r26, %ctaid.x;
63
+ .loc 1 21 33
64
+ shl.b32 %r63, %r26, 4;
65
+ .loc 1 22 23
66
+ or.b32 %r3, %r63, %r60;
67
+ or.b32 %r4, %r3, 8;
68
+ or.b32 %r64, %r63, %r61;
69
+ .loc 1 26 30
70
+ mul.wide.s32 %rd55, %r3, 8;
71
+ add.s64 %rd16, %rd53, %rd55;
72
+ add.s64 %rd32, %rd16, 64;
73
+ mul.wide.s32 %rd56, %r64, 8;
74
+ add.s64 %rd48, %rd53, %rd56;
75
+ mov.pred %p93, -1;
76
+ .loc 1 26 35
77
+ mov.u64 %rd15, 0x0;
78
+ @%p93 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd16 + 0 ];
79
+ mov.u64 %rd17, 0x0;
80
+ @%p93 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd16 + 0 ];
81
+ mov.u64 %rd19, 0x0;
82
+ @%p93 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd16 + 0 ];
83
+ mov.u64 %rd21, 0x0;
84
+ @%p93 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd16 + 0 ];
85
+ mov.u64 %rd23, 0x0;
86
+ @%p93 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd16 + 0 ];
87
+ mov.u64 %rd25, 0x0;
88
+ @%p93 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd16 + 0 ];
89
+ mov.u64 %rd27, 0x0;
90
+ @%p93 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd16 + 0 ];
91
+ mov.u64 %rd29, 0x0;
92
+ @%p93 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd16 + 0 ];
93
+ mov.u64 %rd31, 0x0;
94
+ @%p93 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd32 + 0 ];
95
+ mov.u64 %rd33, 0x0;
96
+ @%p93 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd32 + 0 ];
97
+ mov.u64 %rd35, 0x0;
98
+ @%p93 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd32 + 0 ];
99
+ mov.u64 %rd37, 0x0;
100
+ @%p93 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd32 + 0 ];
101
+ mov.u64 %rd39, 0x0;
102
+ @%p93 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd32 + 0 ];
103
+ mov.u64 %rd41, 0x0;
104
+ @%p93 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd32 + 0 ];
105
+ mov.u64 %rd43, 0x0;
106
+ @%p93 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd32 + 0 ];
107
+ mov.u64 %rd45, 0x0;
108
+ @%p93 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd32 + 0 ];
109
+ mov.u64 %rd47, 0x0;
110
+ @%p93 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd48 + 0 ];
111
+ .loc 1 27 18
112
+ bfe.s32 %r65, %r26, 27, 1;
113
+ shr.u32 %r66, %r65, 23;
114
+ add.s32 %r67, %r3, %r66;
115
+ and.b32 %r68, %r67, 16776704;
116
+ sub.s32 %r69, %r3, %r68;
117
+ add.s32 %r70, %r4, %r66;
118
+ and.b32 %r71, %r70, 16776704;
119
+ sub.s32 %r72, %r4, %r71;
120
+ .loc 1 35 44
121
+ shl.b32 %r73, %r69, 8;
122
+ shl.b32 %r74, %r72, 8;
123
+ .loc 1 35 40
124
+ or.b32 %r75, %r73, %r1;
125
+ or.b32 %r76, %r74, %r1;
126
+ .loc 1 35 34
127
+ mul.wide.s32 %rd57, %r75, 4;
128
+ add.s64 %rd80, %rd54, %rd57;
129
+ cvt.s64.s32 %rd58, %r73;
130
+ cvt.u64.u32 %rd59, %r1;
131
+ or.b64 %rd60, %rd58, %rd59;
132
+ shl.b64 %rd61, %rd60, 2;
133
+ add.s64 %rd62, %rd54, %rd61;
134
+ add.s64 %rd81, %rd62, 16;
135
+ mul.wide.s32 %rd63, %r76, 4;
136
+ add.s64 %rd82, %rd54, %rd63;
137
+ cvt.s64.s32 %rd64, %r74;
138
+ or.b64 %rd65, %rd64, %rd59;
139
+ shl.b64 %rd66, %rd65, 2;
140
+ add.s64 %rd67, %rd54, %rd66;
141
+ add.s64 %rd83, %rd67, 16;
142
+ mov.b32 %r257, 0;
143
+ .loc 1 35 50
144
+ mov.u32 %r27, 0x0;
145
+ mov.u32 %r28, 0x0;
146
+ mov.u32 %r29, 0x0;
147
+ mov.u32 %r30, 0x0;
148
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r27, %r28, %r29, %r30 }, [ %rd80 + 0 ];
149
+ @!%p93 mov.u32 %r27, %r257;
150
+ @!%p93 mov.u32 %r28, %r257;
151
+ @!%p93 mov.u32 %r29, %r257;
152
+ @!%p93 mov.u32 %r30, %r257;
153
+ mov.b32 %f1, %r27;
154
+ mov.b32 %f2, %r28;
155
+ mov.b32 %f3, %r29;
156
+ mov.b32 %f4, %r30;
157
+ mov.u32 %r35, 0x0;
158
+ mov.u32 %r36, 0x0;
159
+ mov.u32 %r37, 0x0;
160
+ mov.u32 %r38, 0x0;
161
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd81 + 0 ];
162
+ @!%p93 mov.u32 %r35, %r257;
163
+ @!%p93 mov.u32 %r36, %r257;
164
+ @!%p93 mov.u32 %r37, %r257;
165
+ @!%p93 mov.u32 %r38, %r257;
166
+ mov.b32 %f5, %r35;
167
+ mov.b32 %f6, %r36;
168
+ mov.b32 %f7, %r37;
169
+ mov.b32 %f8, %r38;
170
+ mov.u32 %r43, 0x0;
171
+ mov.u32 %r44, 0x0;
172
+ mov.u32 %r45, 0x0;
173
+ mov.u32 %r46, 0x0;
174
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd82 + 0 ];
175
+ @!%p93 mov.u32 %r43, %r257;
176
+ @!%p93 mov.u32 %r44, %r257;
177
+ @!%p93 mov.u32 %r45, %r257;
178
+ @!%p93 mov.u32 %r46, %r257;
179
+ mov.b32 %f9, %r43;
180
+ mov.b32 %f10, %r44;
181
+ mov.b32 %f11, %r45;
182
+ mov.b32 %f12, %r46;
183
+ mov.u32 %r51, 0x0;
184
+ mov.u32 %r52, 0x0;
185
+ mov.u32 %r53, 0x0;
186
+ mov.u32 %r54, 0x0;
187
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r51, %r52, %r53, %r54 }, [ %rd83 + 0 ];
188
+ @!%p93 mov.u32 %r51, %r257;
189
+ @!%p93 mov.u32 %r52, %r257;
190
+ @!%p93 mov.u32 %r53, %r257;
191
+ @!%p93 mov.u32 %r54, %r257;
192
+ mov.b32 %f13, %r51;
193
+ mov.b32 %f14, %r52;
194
+ mov.b32 %f15, %r53;
195
+ mov.b32 %f16, %r54;
196
+ .loc 1 36 22
197
+ add.s64 %rd68, %rd47, 50257;
198
+ .loc 1 37 22
199
+ setp.lt.s64 %p38, %rd47, 0;
200
+ .loc 1 38 36
201
+ selp.b64 %rd7, %rd68, %rd47, %p38;
202
+ .loc 1 39 40
203
+ setp.lt.u64 %p39, %rd7, 50257;
204
+ mov.b32 %r374, 883;
205
+ mov.u64 %rd112, 1;
206
+ .loc 1 39 55
207
+ @%p39 bra $L__BB0_2;
208
+ mov.u64 %rd69, assertMessage_0;
209
+ cvta.global.u64 %rd70, %rd69;
210
+ mov.u64 %rd71, assertFile_0;
211
+ cvta.global.u64 %rd72, %rd71;
212
+ mov.u64 %rd73, assertFunc_0;
213
+ cvta.global.u64 %rd74, %rd73;
214
+ { // callseq 8, 0
215
+ .reg .b32 temp_param_reg;
216
+ .param .b64 param0;
217
+ st.param.b64 [param0+0], %rd70;
218
+ .param .b64 param1;
219
+ st.param.b64 [param1+0], %rd72;
220
+ .param .b32 param2;
221
+ st.param.b32 [param2+0], %r374;
222
+ .param .b64 param3;
223
+ st.param.b64 [param3+0], %rd74;
224
+ .param .b64 param4;
225
+ st.param.b64 [param4+0], %rd112;
226
+ call.uni
227
+ __assertfail,
228
+ (
229
+ param0,
230
+ param1,
231
+ param2,
232
+ param3,
233
+ param4
234
+ );
235
+ } // callseq 8
236
+ $L__BB0_2:
237
+ .loc 1 0 55
238
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5de6de_param_4];
239
+ .loc 1 37 22
240
+ setp.lt.s64 %p83, %rd31, 0;
241
+ setp.lt.s64 %p84, %rd15, 0;
242
+ .loc 1 40 44
243
+ shl.b64 %rd85, %rd15, 8;
244
+ add.s64 %rd86, %rd85, 12865792;
245
+ selp.b64 %rd87, %rd86, %rd85, %p84;
246
+ shl.b64 %rd88, %rd31, 8;
247
+ add.s64 %rd89, %rd88, 12865792;
248
+ selp.b64 %rd90, %rd89, %rd88, %p83;
249
+ .loc 1 40 40
250
+ or.b64 %rd92, %rd87, %rd59;
251
+ or.b64 %rd93, %rd90, %rd59;
252
+ .loc 1 40 34
253
+ shl.b64 %rd94, %rd92, 2;
254
+ add.s64 %rd104, %rd12, %rd94;
255
+ add.s64 %rd105, %rd104, 16;
256
+ shl.b64 %rd95, %rd93, 2;
257
+ add.s64 %rd106, %rd12, %rd95;
258
+ add.s64 %rd107, %rd106, 16;
259
+ .loc 1 40 52
260
+ mov.u32 %r78, 0x0;
261
+ mov.u32 %r79, 0x0;
262
+ mov.u32 %r80, 0x0;
263
+ mov.u32 %r81, 0x0;
264
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd104 + 0 ];
265
+ @!%p93 mov.u32 %r78, %r257;
266
+ @!%p93 mov.u32 %r79, %r257;
267
+ @!%p93 mov.u32 %r80, %r257;
268
+ @!%p93 mov.u32 %r81, %r257;
269
+ mov.b32 %f27, %r78;
270
+ mov.b32 %f28, %r79;
271
+ mov.b32 %f29, %r80;
272
+ mov.b32 %f30, %r81;
273
+ mov.u32 %r86, 0x0;
274
+ mov.u32 %r87, 0x0;
275
+ mov.u32 %r88, 0x0;
276
+ mov.u32 %r89, 0x0;
277
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd105 + 0 ];
278
+ @!%p93 mov.u32 %r86, %r257;
279
+ @!%p93 mov.u32 %r87, %r257;
280
+ @!%p93 mov.u32 %r88, %r257;
281
+ @!%p93 mov.u32 %r89, %r257;
282
+ mov.b32 %f31, %r86;
283
+ mov.b32 %f32, %r87;
284
+ mov.b32 %f33, %r88;
285
+ mov.b32 %f34, %r89;
286
+ mov.u32 %r94, 0x0;
287
+ mov.u32 %r95, 0x0;
288
+ mov.u32 %r96, 0x0;
289
+ mov.u32 %r97, 0x0;
290
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd106 + 0 ];
291
+ @!%p93 mov.u32 %r94, %r257;
292
+ @!%p93 mov.u32 %r95, %r257;
293
+ @!%p93 mov.u32 %r96, %r257;
294
+ @!%p93 mov.u32 %r97, %r257;
295
+ mov.b32 %f35, %r94;
296
+ mov.b32 %f36, %r95;
297
+ mov.b32 %f37, %r96;
298
+ mov.b32 %f38, %r97;
299
+ mov.u32 %r102, 0x0;
300
+ mov.u32 %r103, 0x0;
301
+ mov.u32 %r104, 0x0;
302
+ mov.u32 %r105, 0x0;
303
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd107 + 0 ];
304
+ @!%p93 mov.u32 %r102, %r257;
305
+ @!%p93 mov.u32 %r103, %r257;
306
+ @!%p93 mov.u32 %r104, %r257;
307
+ @!%p93 mov.u32 %r105, %r257;
308
+ mov.b32 %f39, %r102;
309
+ mov.b32 %f40, %r103;
310
+ mov.b32 %f41, %r104;
311
+ mov.b32 %f42, %r105;
312
+ .loc 1 41 22
313
+ add.f32 %f43, %f1, %f27;
314
+ add.f32 %f44, %f2, %f28;
315
+ add.f32 %f45, %f3, %f29;
316
+ add.f32 %f46, %f4, %f30;
317
+ add.f32 %f47, %f5, %f31;
318
+ add.f32 %f48, %f6, %f32;
319
+ add.f32 %f49, %f7, %f33;
320
+ add.f32 %f50, %f8, %f34;
321
+ add.f32 %f51, %f9, %f35;
322
+ add.f32 %f52, %f10, %f36;
323
+ add.f32 %f53, %f11, %f37;
324
+ add.f32 %f54, %f12, %f38;
325
+ add.f32 %f55, %f13, %f39;
326
+ add.f32 %f56, %f14, %f40;
327
+ add.f32 %f57, %f15, %f41;
328
+ add.f32 %f58, %f16, %f42;
329
+ $L__tmp1:
330
+ .loc 2 98 22
331
+ add.f32 %f59, %f43, 0f00000000;
332
+ add.f32 %f60, %f44, 0f00000000;
333
+ add.f32 %f61, %f45, 0f00000000;
334
+ add.f32 %f62, %f46, 0f00000000;
335
+ add.f32 %f63, %f47, 0f00000000;
336
+ add.f32 %f64, %f48, 0f00000000;
337
+ add.f32 %f65, %f49, 0f00000000;
338
+ add.f32 %f66, %f50, 0f00000000;
339
+ add.f32 %f67, %f51, 0f00000000;
340
+ add.f32 %f68, %f52, 0f00000000;
341
+ add.f32 %f69, %f53, 0f00000000;
342
+ add.f32 %f70, %f54, 0f00000000;
343
+ add.f32 %f71, %f55, 0f00000000;
344
+ add.f32 %f72, %f56, 0f00000000;
345
+ add.f32 %f73, %f57, 0f00000000;
346
+ add.f32 %f74, %f58, 0f00000000;
347
+ .loc 2 101 30
348
+ sub.f32 %f75, %f43, %f59;
349
+ sub.f32 %f76, %f44, %f60;
350
+ sub.f32 %f77, %f45, %f61;
351
+ sub.f32 %f78, %f46, %f62;
352
+ sub.f32 %f79, %f47, %f63;
353
+ sub.f32 %f80, %f48, %f64;
354
+ sub.f32 %f81, %f49, %f65;
355
+ sub.f32 %f82, %f50, %f66;
356
+ sub.f32 %f83, %f51, %f67;
357
+ sub.f32 %f84, %f52, %f68;
358
+ sub.f32 %f85, %f53, %f69;
359
+ sub.f32 %f86, %f54, %f70;
360
+ sub.f32 %f87, %f55, %f71;
361
+ sub.f32 %f88, %f56, %f72;
362
+ sub.f32 %f89, %f57, %f73;
363
+ sub.f32 %f90, %f58, %f74;
364
+ .loc 2 101 13
365
+ fma.rn.f32 %f91, %f43, %f75, 0f00000000;
366
+ fma.rn.f32 %f92, %f44, %f76, 0f00000000;
367
+ fma.rn.f32 %f93, %f45, %f77, 0f00000000;
368
+ fma.rn.f32 %f94, %f46, %f78, 0f00000000;
369
+ fma.rn.f32 %f95, %f47, %f79, 0f00000000;
370
+ fma.rn.f32 %f96, %f48, %f80, 0f00000000;
371
+ fma.rn.f32 %f97, %f49, %f81, 0f00000000;
372
+ fma.rn.f32 %f98, %f50, %f82, 0f00000000;
373
+ fma.rn.f32 %f99, %f51, %f83, 0f00000000;
374
+ fma.rn.f32 %f100, %f52, %f84, 0f00000000;
375
+ fma.rn.f32 %f101, %f53, %f85, 0f00000000;
376
+ fma.rn.f32 %f102, %f54, %f86, 0f00000000;
377
+ fma.rn.f32 %f103, %f55, %f87, 0f00000000;
378
+ fma.rn.f32 %f104, %f56, %f88, 0f00000000;
379
+ fma.rn.f32 %f105, %f57, %f89, 0f00000000;
380
+ fma.rn.f32 %f106, %f58, %f90, 0f00000000;
381
+ $L__tmp2:
382
+ .loc 2 108 21
383
+ sub.f32 %f107, %f60, %f59;
384
+ mov.b32 %r111, 1065353216;
385
+ mov.b32 %r112, 1073741824;
386
+ .loc 2 110 60
387
+ div.full.f32 %r110, %r111, %r112;
388
+ mov.b32 %f108, %r110;
389
+ .loc 2 112 17
390
+ fma.rn.f32 %f109, %f108, %f107, %f59;
391
+ .loc 2 113 15
392
+ add.f32 %f110, %f91, %f92;
393
+ .loc 2 113 30
394
+ mul.f32 %f111, %f107, %f107;
395
+ .loc 2 113 22
396
+ fma.rn.f32 %f112, %f108, %f111, %f110;
397
+ .loc 2 108 21
398
+ sub.f32 %f113, %f61, %f109;
399
+ mov.b32 %r115, 1077936128;
400
+ .loc 2 110 60
401
+ div.full.f32 %r113, %r111, %r115;
402
+ mov.b32 %f114, %r113;
403
+ .loc 2 112 17
404
+ fma.rn.f32 %f115, %f114, %f113, %f109;
405
+ .loc 2 113 15
406
+ add.f32 %f116, %f93, %f112;
407
+ .loc 2 113 30
408
+ mul.f32 %f117, %f113, %f113;
409
+ .loc 2 113 38
410
+ fma.rn.f32 %f118, %f113, %f113, %f117;
411
+ .loc 2 113 22
412
+ fma.rn.f32 %f119, %f114, %f118, %f116;
413
+ .loc 2 108 21
414
+ sub.f32 %f120, %f62, %f115;
415
+ mov.b32 %r118, 1082130432;
416
+ .loc 2 110 60
417
+ div.full.f32 %r116, %r111, %r118;
418
+ mov.b32 %f121, %r116;
419
+ .loc 2 112 17
420
+ fma.rn.f32 %f122, %f121, %f120, %f115;
421
+ .loc 2 113 15
422
+ add.f32 %f123, %f94, %f119;
423
+ .loc 2 113 30
424
+ mul.f32 %f124, %f120, %f120;
425
+ .loc 2 113 38
426
+ mul.f32 %f125, %f124, 0f40400000;
427
+ .loc 2 113 22
428
+ fma.rn.f32 %f126, %f121, %f125, %f123;
429
+ .loc 2 108 21
430
+ sub.f32 %f127, %f63, %f122;
431
+ mov.b32 %r121, 1084227584;
432
+ .loc 2 110 60
433
+ div.full.f32 %r119, %r111, %r121;
434
+ mov.b32 %f128, %r119;
435
+ .loc 2 112 17
436
+ fma.rn.f32 %f129, %f128, %f127, %f122;
437
+ .loc 2 113 15
438
+ add.f32 %f130, %f95, %f126;
439
+ .loc 2 113 30
440
+ mul.f32 %f131, %f127, %f127;
441
+ .loc 2 113 38
442
+ mul.f32 %f132, %f131, 0f40800000;
443
+ .loc 2 113 22
444
+ fma.rn.f32 %f133, %f128, %f132, %f130;
445
+ .loc 2 108 21
446
+ sub.f32 %f134, %f64, %f129;
447
+ mov.b32 %r124, 1086324736;
448
+ .loc 2 110 60
449
+ div.full.f32 %r122, %r111, %r124;
450
+ mov.b32 %f135, %r122;
451
+ .loc 2 112 17
452
+ fma.rn.f32 %f136, %f135, %f134, %f129;
453
+ .loc 2 113 15
454
+ add.f32 %f137, %f96, %f133;
455
+ .loc 2 113 30
456
+ mul.f32 %f138, %f134, %f134;
457
+ .loc 2 113 38
458
+ mul.f32 %f139, %f138, 0f40A00000;
459
+ .loc 2 113 22
460
+ fma.rn.f32 %f140, %f135, %f139, %f137;
461
+ .loc 2 108 21
462
+ sub.f32 %f141, %f65, %f136;
463
+ mov.b32 %r127, 1088421888;
464
+ .loc 2 110 60
465
+ div.full.f32 %r125, %r111, %r127;
466
+ mov.b32 %f142, %r125;
467
+ .loc 2 112 17
468
+ fma.rn.f32 %f143, %f142, %f141, %f136;
469
+ .loc 2 113 15
470
+ add.f32 %f144, %f97, %f140;
471
+ .loc 2 113 30
472
+ mul.f32 %f145, %f141, %f141;
473
+ .loc 2 113 38
474
+ mul.f32 %f146, %f145, 0f40C00000;
475
+ .loc 2 113 22
476
+ fma.rn.f32 %f147, %f142, %f146, %f144;
477
+ .loc 2 108 21
478
+ sub.f32 %f148, %f66, %f143;
479
+ mov.b32 %r130, 1090519040;
480
+ .loc 2 110 60
481
+ div.full.f32 %r128, %r111, %r130;
482
+ mov.b32 %f149, %r128;
483
+ .loc 2 112 17
484
+ fma.rn.f32 %f150, %f149, %f148, %f143;
485
+ .loc 2 113 15
486
+ add.f32 %f151, %f98, %f147;
487
+ .loc 2 113 30
488
+ mul.f32 %f152, %f148, %f148;
489
+ .loc 2 113 38
490
+ mul.f32 %f153, %f152, 0f40E00000;
491
+ .loc 2 113 22
492
+ fma.rn.f32 %f154, %f149, %f153, %f151;
493
+ .loc 2 108 21
494
+ sub.f32 %f155, %f68, %f67;
495
+ .loc 2 110 60
496
+ div.full.f32 %r131, %r111, %r112;
497
+ mov.b32 %f156, %r131;
498
+ .loc 2 112 17
499
+ fma.rn.f32 %f157, %f155, %f156, %f67;
500
+ .loc 2 113 15
501
+ add.f32 %f158, %f99, %f100;
502
+ .loc 2 113 30
503
+ mul.f32 %f159, %f155, %f155;
504
+ .loc 2 113 22
505
+ fma.rn.f32 %f160, %f159, %f156, %f158;
506
+ .loc 2 108 21
507
+ sub.f32 %f161, %f69, %f157;
508
+ .loc 2 110 60
509
+ div.full.f32 %r134, %r111, %r115;
510
+ mov.b32 %f162, %r134;
511
+ .loc 2 112 17
512
+ fma.rn.f32 %f163, %f162, %f161, %f157;
513
+ .loc 2 113 15
514
+ add.f32 %f164, %f101, %f160;
515
+ .loc 2 113 30
516
+ mul.f32 %f165, %f161, %f161;
517
+ .loc 2 113 38
518
+ fma.rn.f32 %f166, %f161, %f161, %f165;
519
+ .loc 2 113 22
520
+ fma.rn.f32 %f167, %f162, %f166, %f164;
521
+ .loc 2 108 21
522
+ sub.f32 %f168, %f70, %f163;
523
+ .loc 2 110 60
524
+ div.full.f32 %r137, %r111, %r118;
525
+ mov.b32 %f169, %r137;
526
+ .loc 2 112 17
527
+ fma.rn.f32 %f170, %f169, %f168, %f163;
528
+ .loc 2 113 15
529
+ add.f32 %f171, %f102, %f167;
530
+ .loc 2 113 30
531
+ mul.f32 %f172, %f168, %f168;
532
+ .loc 2 113 38
533
+ mul.f32 %f173, %f172, 0f40400000;
534
+ .loc 2 113 22
535
+ fma.rn.f32 %f174, %f169, %f173, %f171;
536
+ .loc 2 108 21
537
+ sub.f32 %f175, %f71, %f170;
538
+ .loc 2 110 60
539
+ div.full.f32 %r140, %r111, %r121;
540
+ mov.b32 %f176, %r140;
541
+ .loc 2 112 17
542
+ fma.rn.f32 %f177, %f176, %f175, %f170;
543
+ .loc 2 113 15
544
+ add.f32 %f178, %f103, %f174;
545
+ .loc 2 113 30
546
+ mul.f32 %f179, %f175, %f175;
547
+ .loc 2 113 38
548
+ mul.f32 %f180, %f179, 0f40800000;
549
+ .loc 2 113 22
550
+ fma.rn.f32 %f181, %f176, %f180, %f178;
551
+ .loc 2 108 21
552
+ sub.f32 %f182, %f72, %f177;
553
+ .loc 2 110 60
554
+ div.full.f32 %r143, %r111, %r124;
555
+ mov.b32 %f183, %r143;
556
+ .loc 2 112 17
557
+ fma.rn.f32 %f184, %f183, %f182, %f177;
558
+ .loc 2 113 15
559
+ add.f32 %f185, %f104, %f181;
560
+ .loc 2 113 30
561
+ mul.f32 %f186, %f182, %f182;
562
+ .loc 2 113 38
563
+ mul.f32 %f187, %f186, 0f40A00000;
564
+ .loc 2 113 22
565
+ fma.rn.f32 %f188, %f183, %f187, %f185;
566
+ .loc 2 108 21
567
+ sub.f32 %f189, %f73, %f184;
568
+ .loc 2 110 60
569
+ div.full.f32 %r146, %r111, %r127;
570
+ mov.b32 %f190, %r146;
571
+ .loc 2 112 17
572
+ fma.rn.f32 %f191, %f190, %f189, %f184;
573
+ .loc 2 113 15
574
+ add.f32 %f192, %f105, %f188;
575
+ .loc 2 113 30
576
+ mul.f32 %f193, %f189, %f189;
577
+ .loc 2 113 38
578
+ mul.f32 %f194, %f193, 0f40C00000;
579
+ .loc 2 113 22
580
+ fma.rn.f32 %f195, %f190, %f194, %f192;
581
+ .loc 2 108 21
582
+ sub.f32 %f196, %f74, %f191;
583
+ .loc 2 110 60
584
+ div.full.f32 %r149, %r111, %r130;
585
+ mov.b32 %f197, %r149;
586
+ .loc 2 112 17
587
+ fma.rn.f32 %f198, %f197, %f196, %f191;
588
+ .loc 2 113 15
589
+ add.f32 %f199, %f106, %f195;
590
+ .loc 2 113 30
591
+ mul.f32 %f200, %f196, %f196;
592
+ .loc 2 113 38
593
+ mul.f32 %f201, %f200, 0f40E00000;
594
+ .loc 2 113 22
595
+ fma.rn.f32 %f202, %f197, %f201, %f199;
596
+ $L__tmp3:
597
+ .loc 2 120 46
598
+ mov.b32 %r216, %f150;
599
+ shfl.sync.bfly.b32 %r217, %r216, 16, 31, -1;
600
+ mov.b32 %f203, %r217;
601
+ mov.b32 %r218, %f154;
602
+ shfl.sync.bfly.b32 %r219, %r218, 16, 31, -1;
603
+ mov.b32 %f204, %r219;
604
+ shfl.sync.bfly.b32 %r153, %r130, 16, 31, -1;
605
+ mov.b32 %f205, %r153;
606
+ $L__tmp4:
607
+ .loc 2 108 21
608
+ sub.f32 %f206, %f203, %f150;
609
+ .loc 2 109 28
610
+ add.f32 %f207, %f205, 0f41000000;
611
+ .loc 2 110 39
612
+ setp.eq.f32 %p85, %f207, 0f00000000;
613
+ .loc 2 110 60
614
+ mov.b32 %r154, %f207;
615
+ div.full.f32 %r152, %r153, %r154;
616
+ mov.b32 %f208, %r152;
617
+ .loc 2 110 49
618
+ selp.f32 %f209, 0f00000000, %f208, %p85;
619
+ .loc 2 112 17
620
+ fma.rn.f32 %f210, %f209, %f206, %f150;
621
+ .loc 2 113 15
622
+ add.f32 %f211, %f154, %f204;
623
+ .loc 2 113 30
624
+ mul.f32 %f212, %f206, %f206;
625
+ .loc 2 113 38
626
+ mul.f32 %f213, %f212, 0f41000000;
627
+ .loc 2 113 22
628
+ fma.rn.f32 %f214, %f209, %f213, %f211;
629
+ $L__tmp5:
630
+ .loc 2 120 46
631
+ mov.b32 %r220, %f210;
632
+ shfl.sync.bfly.b32 %r221, %r220, 8, 31, -1;
633
+ mov.b32 %f215, %r221;
634
+ mov.b32 %r222, %f214;
635
+ shfl.sync.bfly.b32 %r223, %r222, 8, 31, -1;
636
+ mov.b32 %f216, %r223;
637
+ shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1;
638
+ mov.b32 %f217, %r156;
639
+ $L__tmp6:
640
+ .loc 2 108 21
641
+ sub.f32 %f218, %f215, %f210;
642
+ .loc 2 109 28
643
+ add.f32 %f219, %f207, %f217;
644
+ .loc 2 110 39
645
+ setp.eq.f32 %p86, %f219, 0f00000000;
646
+ .loc 2 110 60
647
+ mov.b32 %r157, %f219;
648
+ div.full.f32 %r155, %r156, %r157;
649
+ mov.b32 %f220, %r155;
650
+ .loc 2 110 49
651
+ selp.f32 %f221, 0f00000000, %f220, %p86;
652
+ .loc 2 112 17
653
+ fma.rn.f32 %f222, %f221, %f218, %f210;
654
+ .loc 2 113 15
655
+ add.f32 %f223, %f214, %f216;
656
+ .loc 2 113 30
657
+ mul.f32 %f224, %f218, %f218;
658
+ .loc 2 113 38
659
+ mul.f32 %f225, %f207, %f224;
660
+ .loc 2 113 22
661
+ fma.rn.f32 %f226, %f221, %f225, %f223;
662
+ $L__tmp7:
663
+ .loc 2 120 46
664
+ mov.b32 %r224, %f222;
665
+ shfl.sync.bfly.b32 %r225, %r224, 4, 31, -1;
666
+ mov.b32 %f227, %r225;
667
+ mov.b32 %r226, %f226;
668
+ shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
669
+ mov.b32 %f228, %r227;
670
+ shfl.sync.bfly.b32 %r159, %r157, 4, 31, -1;
671
+ mov.b32 %f229, %r159;
672
+ $L__tmp8:
673
+ .loc 2 108 21
674
+ sub.f32 %f230, %f227, %f222;
675
+ .loc 2 109 28
676
+ add.f32 %f231, %f219, %f229;
677
+ .loc 2 110 39
678
+ setp.eq.f32 %p87, %f231, 0f00000000;
679
+ .loc 2 110 60
680
+ mov.b32 %r160, %f231;
681
+ div.full.f32 %r158, %r159, %r160;
682
+ mov.b32 %f232, %r158;
683
+ .loc 2 110 49
684
+ selp.f32 %f233, 0f00000000, %f232, %p87;
685
+ .loc 2 112 17
686
+ fma.rn.f32 %f234, %f230, %f233, %f222;
687
+ .loc 2 113 15
688
+ add.f32 %f235, %f226, %f228;
689
+ .loc 2 113 30
690
+ mul.f32 %f236, %f230, %f230;
691
+ .loc 2 113 38
692
+ mul.f32 %f237, %f219, %f236;
693
+ .loc 2 113 22
694
+ fma.rn.f32 %f238, %f233, %f237, %f235;
695
+ $L__tmp9:
696
+ .loc 2 120 46
697
+ mov.b32 %r228, %f234;
698
+ shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
699
+ mov.b32 %f239, %r229;
700
+ mov.b32 %r230, %f238;
701
+ shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1;
702
+ mov.b32 %f240, %r231;
703
+ shfl.sync.bfly.b32 %r162, %r160, 2, 31, -1;
704
+ mov.b32 %f241, %r162;
705
+ $L__tmp10:
706
+ .loc 2 108 21
707
+ sub.f32 %f242, %f239, %f234;
708
+ .loc 2 109 28
709
+ add.f32 %f17, %f231, %f241;
710
+ .loc 2 110 39
711
+ setp.eq.f32 %p88, %f17, 0f00000000;
712
+ .loc 2 110 60
713
+ mov.b32 %r163, %f17;
714
+ div.full.f32 %r161, %r162, %r163;
715
+ mov.b32 %f243, %r161;
716
+ .loc 2 110 49
717
+ selp.f32 %f244, 0f00000000, %f243, %p88;
718
+ .loc 2 112 17
719
+ fma.rn.f32 %f18, %f242, %f244, %f234;
720
+ .loc 2 113 15
721
+ add.f32 %f245, %f238, %f240;
722
+ .loc 2 113 30
723
+ mul.f32 %f246, %f242, %f242;
724
+ .loc 2 113 38
725
+ mul.f32 %f247, %f231, %f246;
726
+ .loc 2 113 22
727
+ fma.rn.f32 %f19, %f244, %f247, %f245;
728
+ $L__tmp11:
729
+ .loc 2 120 46
730
+ mov.b32 %r232, %f18;
731
+ shfl.sync.bfly.b32 %r5, %r232, 1, 31, -1;
732
+ mov.b32 %r233, %f19;
733
+ shfl.sync.bfly.b32 %r6, %r233, 1, 31, -1;
734
+ shfl.sync.bfly.b32 %r165, %r163, 1, 31, -1;
735
+ mov.b32 %f248, %r165;
736
+ $L__tmp12:
737
+ .loc 2 109 28
738
+ add.f32 %f20, %f17, %f248;
739
+ .loc 2 110 60
740
+ mov.b32 %r166, %f20;
741
+ div.full.f32 %r164, %r165, %r166;
742
+ mov.b32 %f21, %r164;
743
+ $L__tmp13:
744
+ .loc 2 120 46
745
+ mov.b32 %r234, %f198;
746
+ shfl.sync.bfly.b32 %r235, %r234, 16, 31, -1;
747
+ mov.b32 %f249, %r235;
748
+ mov.b32 %r236, %f202;
749
+ shfl.sync.bfly.b32 %r237, %r236, 16, 31, -1;
750
+ mov.b32 %f250, %r237;
751
+ shfl.sync.bfly.b32 %r168, %r130, 16, 31, -1;
752
+ mov.b32 %f251, %r168;
753
+ $L__tmp14:
754
+ .loc 2 108 21
755
+ sub.f32 %f252, %f249, %f198;
756
+ .loc 2 109 28
757
+ add.f32 %f253, %f251, 0f41000000;
758
+ .loc 2 110 39
759
+ setp.eq.f32 %p89, %f253, 0f00000000;
760
+ .loc 2 110 60
761
+ mov.b32 %r169, %f253;
762
+ div.full.f32 %r167, %r168, %r169;
763
+ mov.b32 %f254, %r167;
764
+ .loc 2 110 49
765
+ selp.f32 %f255, 0f00000000, %f254, %p89;
766
+ .loc 2 112 17
767
+ fma.rn.f32 %f256, %f252, %f255, %f198;
768
+ .loc 2 113 15
769
+ add.f32 %f257, %f202, %f250;
770
+ .loc 2 113 30
771
+ mul.f32 %f258, %f252, %f252;
772
+ .loc 2 113 38
773
+ mul.f32 %f259, %f258, 0f41000000;
774
+ .loc 2 113 22
775
+ fma.rn.f32 %f260, %f259, %f255, %f257;
776
+ $L__tmp15:
777
+ .loc 2 120 46
778
+ mov.b32 %r238, %f256;
779
+ shfl.sync.bfly.b32 %r239, %r238, 8, 31, -1;
780
+ mov.b32 %f261, %r239;
781
+ mov.b32 %r240, %f260;
782
+ shfl.sync.bfly.b32 %r241, %r240, 8, 31, -1;
783
+ mov.b32 %f262, %r241;
784
+ shfl.sync.bfly.b32 %r171, %r169, 8, 31, -1;
785
+ mov.b32 %f263, %r171;
786
+ $L__tmp16:
787
+ .loc 2 108 21
788
+ sub.f32 %f264, %f261, %f256;
789
+ .loc 2 109 28
790
+ add.f32 %f265, %f253, %f263;
791
+ .loc 2 110 39
792
+ setp.eq.f32 %p90, %f265, 0f00000000;
793
+ .loc 2 110 60
794
+ mov.b32 %r172, %f265;
795
+ div.full.f32 %r170, %r171, %r172;
796
+ mov.b32 %f266, %r170;
797
+ .loc 2 110 49
798
+ selp.f32 %f267, 0f00000000, %f266, %p90;
799
+ .loc 2 112 17
800
+ fma.rn.f32 %f268, %f264, %f267, %f256;
801
+ .loc 2 113 15
802
+ add.f32 %f269, %f260, %f262;
803
+ .loc 2 113 30
804
+ mul.f32 %f270, %f264, %f264;
805
+ .loc 2 113 38
806
+ mul.f32 %f271, %f253, %f270;
807
+ .loc 2 113 22
808
+ fma.rn.f32 %f272, %f267, %f271, %f269;
809
+ $L__tmp17:
810
+ .loc 2 120 46
811
+ mov.b32 %r242, %f268;
812
+ shfl.sync.bfly.b32 %r243, %r242, 4, 31, -1;
813
+ mov.b32 %f273, %r243;
814
+ mov.b32 %r244, %f272;
815
+ shfl.sync.bfly.b32 %r245, %r244, 4, 31, -1;
816
+ mov.b32 %f274, %r245;
817
+ shfl.sync.bfly.b32 %r174, %r172, 4, 31, -1;
818
+ mov.b32 %f275, %r174;
819
+ $L__tmp18:
820
+ .loc 2 108 21
821
+ sub.f32 %f276, %f273, %f268;
822
+ .loc 2 109 28
823
+ add.f32 %f277, %f265, %f275;
824
+ .loc 2 110 39
825
+ setp.eq.f32 %p91, %f277, 0f00000000;
826
+ .loc 2 110 60
827
+ mov.b32 %r175, %f277;
828
+ div.full.f32 %r173, %r174, %r175;
829
+ mov.b32 %f278, %r173;
830
+ .loc 2 110 49
831
+ selp.f32 %f279, 0f00000000, %f278, %p91;
832
+ .loc 2 112 17
833
+ fma.rn.f32 %f280, %f276, %f279, %f268;
834
+ .loc 2 113 15
835
+ add.f32 %f281, %f272, %f274;
836
+ .loc 2 113 30
837
+ mul.f32 %f282, %f276, %f276;
838
+ .loc 2 113 38
839
+ mul.f32 %f283, %f265, %f282;
840
+ .loc 2 113 22
841
+ fma.rn.f32 %f284, %f279, %f283, %f281;
842
+ $L__tmp19:
843
+ .loc 2 120 46
844
+ mov.b32 %r246, %f280;
845
+ shfl.sync.bfly.b32 %r247, %r246, 2, 31, -1;
846
+ mov.b32 %f285, %r247;
847
+ mov.b32 %r248, %f284;
848
+ shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
849
+ mov.b32 %f286, %r249;
850
+ shfl.sync.bfly.b32 %r177, %r175, 2, 31, -1;
851
+ mov.b32 %f287, %r177;
852
+ $L__tmp20:
853
+ .loc 2 108 21
854
+ sub.f32 %f288, %f285, %f280;
855
+ .loc 2 109 28
856
+ add.f32 %f22, %f277, %f287;
857
+ .loc 2 110 39
858
+ setp.eq.f32 %p92, %f22, 0f00000000;
859
+ .loc 2 110 60
860
+ mov.b32 %r178, %f22;
861
+ div.full.f32 %r176, %r177, %r178;
862
+ mov.b32 %f289, %r176;
863
+ .loc 2 110 49
864
+ selp.f32 %f290, 0f00000000, %f289, %p92;
865
+ .loc 2 112 17
866
+ fma.rn.f32 %f23, %f288, %f290, %f280;
867
+ .loc 2 113 15
868
+ add.f32 %f291, %f284, %f286;
869
+ .loc 2 113 30
870
+ mul.f32 %f292, %f288, %f288;
871
+ .loc 2 113 38
872
+ mul.f32 %f293, %f277, %f292;
873
+ .loc 2 113 22
874
+ fma.rn.f32 %f24, %f290, %f293, %f291;
875
+ $L__tmp21:
876
+ .loc 2 120 46
877
+ mov.b32 %r250, %f23;
878
+ shfl.sync.bfly.b32 %r7, %r250, 1, 31, -1;
879
+ mov.b32 %r251, %f24;
880
+ shfl.sync.bfly.b32 %r8, %r251, 1, 31, -1;
881
+ shfl.sync.bfly.b32 %r180, %r178, 1, 31, -1;
882
+ mov.b32 %f294, %r180;
883
+ $L__tmp22:
884
+ .loc 2 109 28
885
+ add.f32 %f25, %f22, %f294;
886
+ .loc 2 110 60
887
+ mov.b32 %r181, %f25;
888
+ div.full.f32 %r179, %r180, %r181;
889
+ mov.b32 %f26, %r179;
890
+ $L__tmp23:
891
+ .loc 1 59 51
892
+ mov.u32 %r182, 0x0;
893
+ mov.u32 %r183, 0x0;
894
+ mov.u32 %r184, 0x0;
895
+ mov.u32 %r185, 0x0;
896
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r182, %r183, %r184, %r185 }, [ %rd80 + 0 ];
897
+ @!%p93 mov.u32 %r182, %r257;
898
+ @!%p93 mov.u32 %r183, %r257;
899
+ @!%p93 mov.u32 %r184, %r257;
900
+ @!%p93 mov.u32 %r185, %r257;
901
+ mov.u32 %r190, 0x0;
902
+ mov.u32 %r191, 0x0;
903
+ mov.u32 %r192, 0x0;
904
+ mov.u32 %r193, 0x0;
905
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r190, %r191, %r192, %r193 }, [ %rd81 + 0 ];
906
+ @!%p93 mov.u32 %r190, %r257;
907
+ @!%p93 mov.u32 %r191, %r257;
908
+ @!%p93 mov.u32 %r192, %r257;
909
+ @!%p93 mov.u32 %r193, %r257;
910
+ mov.u32 %r198, 0x0;
911
+ mov.u32 %r199, 0x0;
912
+ mov.u32 %r200, 0x0;
913
+ mov.u32 %r201, 0x0;
914
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r198, %r199, %r200, %r201 }, [ %rd82 + 0 ];
915
+ @!%p93 mov.u32 %r198, %r257;
916
+ @!%p93 mov.u32 %r199, %r257;
917
+ @!%p93 mov.u32 %r200, %r257;
918
+ @!%p93 mov.u32 %r201, %r257;
919
+ mov.u32 %r206, 0x0;
920
+ mov.u32 %r207, 0x0;
921
+ mov.u32 %r208, 0x0;
922
+ mov.u32 %r209, 0x0;
923
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r206, %r207, %r208, %r209 }, [ %rd83 + 0 ];
924
+ @!%p93 mov.u32 %r206, %r257;
925
+ @!%p93 mov.u32 %r207, %r257;
926
+ @!%p93 mov.u32 %r208, %r257;
927
+ @!%p93 mov.u32 %r209, %r257;
928
+ .loc 1 60 35
929
+ mul.wide.u32 %rd96, %r2, 4;
930
+ add.s64 %rd84, %rd13, %rd96;
931
+ .loc 1 60 40
932
+ mov.u32 %r214, 0x0;
933
+ @%p93 ld.global.L1::evict_last.b32 { %r214 }, [ %rd84 + 0 ];
934
+ @!%p93 mov.u32 %r214, %r257;
935
+ .loc 1 64 57
936
+ @%p39 bra $L__BB0_4;
937
+ mov.u64 %rd97, assertMessage_1;
938
+ cvta.global.u64 %rd98, %rd97;
939
+ mov.u64 %rd99, assertFile_1;
940
+ cvta.global.u64 %rd100, %rd99;
941
+ mov.u64 %rd101, assertFunc_1;
942
+ cvta.global.u64 %rd102, %rd101;
943
+ { // callseq 9, 0
944
+ .reg .b32 temp_param_reg;
945
+ .param .b64 param0;
946
+ st.param.b64 [param0+0], %rd98;
947
+ .param .b64 param1;
948
+ st.param.b64 [param1+0], %rd100;
949
+ .param .b32 param2;
950
+ st.param.b32 [param2+0], %r374;
951
+ .param .b64 param3;
952
+ st.param.b64 [param3+0], %rd102;
953
+ .param .b64 param4;
954
+ st.param.b64 [param4+0], %rd112;
955
+ call.uni
956
+ __assertfail,
957
+ (
958
+ param0,
959
+ param1,
960
+ param2,
961
+ param3,
962
+ param4
963
+ );
964
+ } // callseq 9
965
+ $L__BB0_4:
966
+ $L__tmp24:
967
+ .loc 2 120 46
968
+ mov.b32 %f295, %r8;
969
+ $L__tmp25:
970
+ .loc 2 113 15
971
+ add.f32 %f296, %f24, %f295;
972
+ $L__tmp26:
973
+ .loc 2 120 46
974
+ mov.b32 %f297, %r7;
975
+ $L__tmp27:
976
+ .loc 2 108 21
977
+ sub.f32 %f298, %f297, %f23;
978
+ .loc 2 113 30
979
+ mul.f32 %f299, %f298, %f298;
980
+ .loc 2 113 38
981
+ mul.f32 %f300, %f22, %f299;
982
+ .loc 2 110 39
983
+ setp.eq.f32 %p115, %f25, 0f00000000;
984
+ .loc 2 110 49
985
+ selp.f32 %f301, 0f00000000, %f26, %p115;
986
+ .loc 2 113 22
987
+ fma.rn.f32 %f302, %f301, %f300, %f296;
988
+ $L__tmp28:
989
+ .loc 2 120 46
990
+ mov.b32 %f303, %r6;
991
+ $L__tmp29:
992
+ .loc 2 113 15
993
+ add.f32 %f304, %f19, %f303;
994
+ $L__tmp30:
995
+ .loc 2 120 46
996
+ mov.b32 %f305, %r5;
997
+ $L__tmp31:
998
+ .loc 2 108 21
999
+ sub.f32 %f306, %f305, %f18;
1000
+ .loc 2 113 30
1001
+ mul.f32 %f307, %f306, %f306;
1002
+ .loc 2 113 38
1003
+ mul.f32 %f308, %f17, %f307;
1004
+ .loc 2 110 39
1005
+ setp.eq.f32 %p116, %f20, 0f00000000;
1006
+ .loc 2 110 49
1007
+ selp.f32 %f309, 0f00000000, %f21, %p116;
1008
+ .loc 2 113 22
1009
+ fma.rn.f32 %f310, %f309, %f308, %f304;
1010
+ $L__tmp32:
1011
+ .loc 1 65 54
1012
+ mov.u32 %r253, 0x0;
1013
+ mov.u32 %r254, 0x0;
1014
+ mov.u32 %r255, 0x0;
1015
+ mov.u32 %r256, 0x0;
1016
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r253, %r254, %r255, %r256 }, [ %rd104 + 0 ];
1017
+ @!%p93 mov.u32 %r253, %r257;
1018
+ @!%p93 mov.u32 %r254, %r257;
1019
+ @!%p93 mov.u32 %r255, %r257;
1020
+ @!%p93 mov.u32 %r256, %r257;
1021
+ mov.u32 %r261, 0x0;
1022
+ mov.u32 %r262, 0x0;
1023
+ mov.u32 %r263, 0x0;
1024
+ mov.u32 %r264, 0x0;
1025
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r261, %r262, %r263, %r264 }, [ %rd105 + 0 ];
1026
+ @!%p93 mov.u32 %r261, %r257;
1027
+ @!%p93 mov.u32 %r262, %r257;
1028
+ @!%p93 mov.u32 %r263, %r257;
1029
+ @!%p93 mov.u32 %r264, %r257;
1030
+ mov.u32 %r269, 0x0;
1031
+ mov.u32 %r270, 0x0;
1032
+ mov.u32 %r271, 0x0;
1033
+ mov.u32 %r272, 0x0;
1034
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r269, %r270, %r271, %r272 }, [ %rd106 + 0 ];
1035
+ @!%p93 mov.u32 %r269, %r257;
1036
+ @!%p93 mov.u32 %r270, %r257;
1037
+ @!%p93 mov.u32 %r271, %r257;
1038
+ @!%p93 mov.u32 %r272, %r257;
1039
+ mov.u32 %r277, 0x0;
1040
+ mov.u32 %r278, 0x0;
1041
+ mov.u32 %r279, 0x0;
1042
+ mov.u32 %r280, 0x0;
1043
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r277, %r278, %r279, %r280 }, [ %rd107 + 0 ];
1044
+ @!%p93 mov.u32 %r277, %r257;
1045
+ @!%p93 mov.u32 %r278, %r257;
1046
+ @!%p93 mov.u32 %r279, %r257;
1047
+ @!%p93 mov.u32 %r280, %r257;
1048
+ .loc 1 69 23
1049
+ mov.b32 %r286, %f310;
1050
+ mov.b32 %r287, 1132462080;
1051
+ div.full.f32 %r285, %r286, %r287;
1052
+ mov.b32 %f311, %r285;
1053
+ mov.b32 %r310, %f302;
1054
+ div.full.f32 %r309, %r310, %r287;
1055
+ mov.b32 %f312, %r309;
1056
+ .loc 1 71 24
1057
+ add.f32 %f313, %f311, 0f3727C5AC;
1058
+ add.f32 %f314, %f312, 0f3727C5AC;
1059
+ .loc 1 72 30
1060
+ rsqrt.approx.ftz.f32 %f315, %f313;
1061
+ rsqrt.approx.ftz.f32 %f316, %f314;
1062
+ .loc 1 65 54
1063
+ mov.b32 %f317, %r280;
1064
+ .loc 1 59 51
1065
+ mov.b32 %f318, %r209;
1066
+ .loc 1 66 24
1067
+ add.f32 %f319, %f318, %f317;
1068
+ $L__tmp33:
1069
+ .loc 2 112 17
1070
+ fma.rn.f32 %f320, %f298, %f301, %f23;
1071
+ $L__tmp34:
1072
+ .loc 1 67 24
1073
+ sub.f32 %f321, %f319, %f320;
1074
+ .loc 1 65 54
1075
+ mov.b32 %f322, %r279;
1076
+ .loc 1 59 51
1077
+ mov.b32 %f323, %r208;
1078
+ .loc 1 66 24
1079
+ add.f32 %f324, %f323, %f322;
1080
+ .loc 1 67 24
1081
+ sub.f32 %f325, %f324, %f320;
1082
+ .loc 1 65 54
1083
+ mov.b32 %f326, %r278;
1084
+ .loc 1 59 51
1085
+ mov.b32 %f327, %r207;
1086
+ .loc 1 66 24
1087
+ add.f32 %f328, %f327, %f326;
1088
+ .loc 1 67 24
1089
+ sub.f32 %f329, %f328, %f320;
1090
+ .loc 1 65 54
1091
+ mov.b32 %f330, %r277;
1092
+ .loc 1 59 51
1093
+ mov.b32 %f331, %r206;
1094
+ .loc 1 66 24
1095
+ add.f32 %f332, %f331, %f330;
1096
+ .loc 1 67 24
1097
+ sub.f32 %f333, %f332, %f320;
1098
+ .loc 1 65 54
1099
+ mov.b32 %f334, %r272;
1100
+ .loc 1 59 51
1101
+ mov.b32 %f335, %r201;
1102
+ .loc 1 66 24
1103
+ add.f32 %f336, %f335, %f334;
1104
+ .loc 1 67 24
1105
+ sub.f32 %f337, %f336, %f320;
1106
+ .loc 1 65 54
1107
+ mov.b32 %f338, %r271;
1108
+ .loc 1 59 51
1109
+ mov.b32 %f339, %r200;
1110
+ .loc 1 66 24
1111
+ add.f32 %f340, %f339, %f338;
1112
+ .loc 1 67 24
1113
+ sub.f32 %f341, %f340, %f320;
1114
+ .loc 1 65 54
1115
+ mov.b32 %f342, %r270;
1116
+ .loc 1 59 51
1117
+ mov.b32 %f343, %r199;
1118
+ .loc 1 66 24
1119
+ add.f32 %f344, %f343, %f342;
1120
+ .loc 1 67 24
1121
+ sub.f32 %f345, %f344, %f320;
1122
+ .loc 1 65 54
1123
+ mov.b32 %f346, %r269;
1124
+ .loc 1 59 51
1125
+ mov.b32 %f347, %r198;
1126
+ .loc 1 66 24
1127
+ add.f32 %f348, %f347, %f346;
1128
+ .loc 1 67 24
1129
+ sub.f32 %f349, %f348, %f320;
1130
+ .loc 1 65 54
1131
+ mov.b32 %f350, %r264;
1132
+ .loc 1 59 51
1133
+ mov.b32 %f351, %r193;
1134
+ .loc 1 66 24
1135
+ add.f32 %f352, %f351, %f350;
1136
+ $L__tmp35:
1137
+ .loc 2 112 17
1138
+ fma.rn.f32 %f353, %f306, %f309, %f18;
1139
+ $L__tmp36:
1140
+ .loc 1 67 24
1141
+ sub.f32 %f354, %f352, %f353;
1142
+ .loc 1 65 54
1143
+ mov.b32 %f355, %r263;
1144
+ .loc 1 59 51
1145
+ mov.b32 %f356, %r192;
1146
+ .loc 1 66 24
1147
+ add.f32 %f357, %f356, %f355;
1148
+ .loc 1 67 24
1149
+ sub.f32 %f358, %f357, %f353;
1150
+ .loc 1 65 54
1151
+ mov.b32 %f359, %r262;
1152
+ .loc 1 59 51
1153
+ mov.b32 %f360, %r191;
1154
+ .loc 1 66 24
1155
+ add.f32 %f361, %f360, %f359;
1156
+ .loc 1 67 24
1157
+ sub.f32 %f362, %f361, %f353;
1158
+ .loc 1 65 54
1159
+ mov.b32 %f363, %r261;
1160
+ .loc 1 59 51
1161
+ mov.b32 %f364, %r190;
1162
+ .loc 1 66 24
1163
+ add.f32 %f365, %f364, %f363;
1164
+ .loc 1 67 24
1165
+ sub.f32 %f366, %f365, %f353;
1166
+ .loc 1 65 54
1167
+ mov.b32 %f367, %r256;
1168
+ .loc 1 59 51
1169
+ mov.b32 %f368, %r185;
1170
+ .loc 1 66 24
1171
+ add.f32 %f369, %f368, %f367;
1172
+ .loc 1 67 24
1173
+ sub.f32 %f370, %f369, %f353;
1174
+ .loc 1 65 54
1175
+ mov.b32 %f371, %r255;
1176
+ .loc 1 59 51
1177
+ mov.b32 %f372, %r184;
1178
+ .loc 1 66 24
1179
+ add.f32 %f373, %f372, %f371;
1180
+ .loc 1 67 24
1181
+ sub.f32 %f374, %f373, %f353;
1182
+ .loc 1 65 54
1183
+ mov.b32 %f375, %r254;
1184
+ .loc 1 59 51
1185
+ mov.b32 %f376, %r183;
1186
+ .loc 1 66 24
1187
+ add.f32 %f377, %f376, %f375;
1188
+ .loc 1 67 24
1189
+ sub.f32 %f378, %f377, %f353;
1190
+ .loc 1 65 54
1191
+ mov.b32 %f379, %r253;
1192
+ .loc 1 59 51
1193
+ mov.b32 %f380, %r182;
1194
+ .loc 1 66 24
1195
+ add.f32 %f381, %f380, %f379;
1196
+ .loc 1 67 24
1197
+ sub.f32 %f382, %f381, %f353;
1198
+ .loc 1 73 24
1199
+ mul.f32 %f383, %f382, %f315;
1200
+ mul.f32 %f384, %f378, %f315;
1201
+ mul.f32 %f385, %f374, %f315;
1202
+ mul.f32 %f386, %f370, %f315;
1203
+ mul.f32 %f387, %f366, %f315;
1204
+ mul.f32 %f388, %f362, %f315;
1205
+ mul.f32 %f389, %f358, %f315;
1206
+ mul.f32 %f390, %f354, %f315;
1207
+ mul.f32 %f391, %f349, %f316;
1208
+ mul.f32 %f392, %f345, %f316;
1209
+ mul.f32 %f393, %f341, %f316;
1210
+ mul.f32 %f394, %f337, %f316;
1211
+ mul.f32 %f395, %f333, %f316;
1212
+ mul.f32 %f396, %f329, %f316;
1213
+ mul.f32 %f397, %f325, %f316;
1214
+ mul.f32 %f398, %f321, %f316;
1215
+ .loc 1 74 24
1216
+ shl.b32 %r357, %r2, 2;
1217
+ mov.u32 %r358, global_smem;
1218
+ add.s32 %r359, %r358, %r357;
1219
+ st.shared.u32 [%r359], %r214;
1220
+ bar.sync 0;
1221
+ shl.b32 %r360, %r1, 2;
1222
+ add.s32 %r361, %r358, %r360;
1223
+ ld.shared.v4.f32 {%f399, %f400, %f401, %f402}, [%r361];
1224
+ ld.shared.v4.f32 {%f403, %f404, %f405, %f406}, [%r361+16];
1225
+ mul.f32 %f407, %f383, %f399;
1226
+ mul.f32 %f408, %f384, %f400;
1227
+ mul.f32 %f409, %f385, %f401;
1228
+ mul.f32 %f410, %f386, %f402;
1229
+ mul.f32 %f411, %f387, %f403;
1230
+ mul.f32 %f412, %f388, %f404;
1231
+ mul.f32 %f413, %f389, %f405;
1232
+ mul.f32 %f414, %f390, %f406;
1233
+ mul.f32 %f415, %f391, %f399;
1234
+ mul.f32 %f416, %f392, %f400;
1235
+ mul.f32 %f417, %f393, %f401;
1236
+ mul.f32 %f418, %f394, %f402;
1237
+ mul.f32 %f419, %f395, %f403;
1238
+ mul.f32 %f420, %f396, %f404;
1239
+ mul.f32 %f421, %f397, %f405;
1240
+ mul.f32 %f422, %f398, %f406;
1241
+ .loc 1 76 39
1242
+ shl.b32 %r362, %r3, 8;
1243
+ shl.b32 %r363, %r4, 8;
1244
+ .loc 1 76 35
1245
+ or.b32 %r364, %r362, %r1;
1246
+ or.b32 %r365, %r363, %r1;
1247
+ .loc 1 76 29
1248
+ mul.wide.s32 %rd110, %r364, 2;
1249
+ add.s64 %rd108, %rd14, %rd110;
1250
+ mul.wide.s32 %rd111, %r365, 2;
1251
+ add.s64 %rd109, %rd14, %rd111;
1252
+ .loc 1 76 52
1253
+ mov.b32 %r333, %f407;
1254
+ cvt.rn.bf16.f32 %rs1, %r333;
1255
+ mov.b32 %r334, %f408;
1256
+ cvt.rn.bf16.f32 %rs2, %r334;
1257
+ mov.b32 %r335, %f409;
1258
+ cvt.rn.bf16.f32 %rs3, %r335;
1259
+ mov.b32 %r336, %f410;
1260
+ cvt.rn.bf16.f32 %rs4, %r336;
1261
+ mov.b32 %r337, %f411;
1262
+ cvt.rn.bf16.f32 %rs5, %r337;
1263
+ mov.b32 %r338, %f412;
1264
+ cvt.rn.bf16.f32 %rs6, %r338;
1265
+ mov.b32 %r339, %f413;
1266
+ cvt.rn.bf16.f32 %rs7, %r339;
1267
+ mov.b32 %r340, %f414;
1268
+ cvt.rn.bf16.f32 %rs8, %r340;
1269
+ mov.b32 %r341, %f415;
1270
+ cvt.rn.bf16.f32 %rs9, %r341;
1271
+ mov.b32 %r342, %f416;
1272
+ cvt.rn.bf16.f32 %rs10, %r342;
1273
+ mov.b32 %r343, %f417;
1274
+ cvt.rn.bf16.f32 %rs11, %r343;
1275
+ mov.b32 %r344, %f418;
1276
+ cvt.rn.bf16.f32 %rs12, %r344;
1277
+ mov.b32 %r345, %f419;
1278
+ cvt.rn.bf16.f32 %rs13, %r345;
1279
+ mov.b32 %r346, %f420;
1280
+ cvt.rn.bf16.f32 %rs14, %r346;
1281
+ mov.b32 %r347, %f421;
1282
+ cvt.rn.bf16.f32 %rs15, %r347;
1283
+ mov.b32 %r348, %f422;
1284
+ cvt.rn.bf16.f32 %rs16, %r348;
1285
+ mov.b32 %r366, {%rs1, %rs2};
1286
+ mov.b32 %r367, {%rs3, %rs4};
1287
+ mov.b32 %r368, {%rs5, %rs6};
1288
+ mov.b32 %r369, {%rs7, %rs8};
1289
+ @%p93 st.global.v4.b32 [ %rd108 + 0 ], { %r366, %r367, %r368, %r369 };
1290
+ mov.b32 %r370, {%rs9, %rs10};
1291
+ mov.b32 %r371, {%rs11, %rs12};
1292
+ mov.b32 %r372, {%rs13, %rs14};
1293
+ mov.b32 %r373, {%rs15, %rs16};
1294
+ @%p93 st.global.v4.b32 [ %rd109 + 0 ], { %r370, %r371, %r372, %r373 };
1295
+ .loc 1 55 4
1296
+ ret;
1297
+ $L__tmp37:
1298
+ $L__func_end0:
1299
+
1300
+ }
1301
+ // .globl __nv_rsqrtf
1302
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1303
+ .param .b32 __nv_rsqrtf_param_0
1304
+ )
1305
+ {
1306
+ .reg .f32 %f<3>;
1307
+ $L__func_begin1:
1308
+
1309
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
1310
+ rsqrt.approx.ftz.f32 %f2, %f1;
1311
+ st.param.f32 [func_retval0+0], %f2;
1312
+ ret;
1313
+ $L__func_end1:
1314
+
1315
+ }
1316
+ .file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py"
1317
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1318
+ .section .debug_abbrev
1319
+ {
1320
+ .b8 1
1321
+ .b8 17
1322
+ .b8 1
1323
+ .b8 37
1324
+ .b8 8
1325
+ .b8 19
1326
+ .b8 5
1327
+ .b8 3
1328
+ .b8 8
1329
+ .b8 16
1330
+ .b8 6
1331
+ .b8 27
1332
+ .b8 8
1333
+ .b8 180
1334
+ .b8 66
1335
+ .b8 12
1336
+ .b8 17
1337
+ .b8 1
1338
+ .b8 18
1339
+ .b8 1
1340
+ .b8 0
1341
+ .b8 0
1342
+ .b8 2
1343
+ .b8 46
1344
+ .b8 0
1345
+ .b8 135
1346
+ .b8 64
1347
+ .b8 8
1348
+ .b8 3
1349
+ .b8 8
1350
+ .b8 58
1351
+ .b8 11
1352
+ .b8 59
1353
+ .b8 11
1354
+ .b8 63
1355
+ .b8 12
1356
+ .b8 32
1357
+ .b8 11
1358
+ .b8 0
1359
+ .b8 0
1360
+ .b8 3
1361
+ .b8 46
1362
+ .b8 1
1363
+ .b8 17
1364
+ .b8 1
1365
+ .b8 18
1366
+ .b8 1
1367
+ .b8 64
1368
+ .b8 10
1369
+ .b8 49
1370
+ .b8 19
1371
+ .b8 0
1372
+ .b8 0
1373
+ .b8 4
1374
+ .b8 29
1375
+ .b8 0
1376
+ .b8 49
1377
+ .b8 19
1378
+ .b8 17
1379
+ .b8 1
1380
+ .b8 18
1381
+ .b8 1
1382
+ .b8 88
1383
+ .b8 11
1384
+ .b8 89
1385
+ .b8 11
1386
+ .b8 87
1387
+ .b8 11
1388
+ .b8 0
1389
+ .b8 0
1390
+ .b8 5
1391
+ .b8 29
1392
+ .b8 1
1393
+ .b8 49
1394
+ .b8 19
1395
+ .b8 17
1396
+ .b8 1
1397
+ .b8 18
1398
+ .b8 1
1399
+ .b8 88
1400
+ .b8 11
1401
+ .b8 89
1402
+ .b8 11
1403
+ .b8 87
1404
+ .b8 11
1405
+ .b8 0
1406
+ .b8 0
1407
+ .b8 0
1408
+ }
1409
+ .section .debug_info
1410
+ {
1411
+ .b32 298
1412
+ .b8 2
1413
+ .b8 0
1414
+ .b32 .debug_abbrev
1415
+ .b8 8
1416
+ .b8 1
1417
+ .b8 116
1418
+ .b8 114
1419
+ .b8 105
1420
+ .b8 116
1421
+ .b8 111
1422
+ .b8 110
1423
+ .b8 0
1424
+ .b8 2
1425
+ .b8 0
1426
+ .b8 99
1427
+ .b8 108
1428
+ .b8 104
1429
+ .b8 101
1430
+ .b8 52
1431
+ .b8 97
1432
+ .b8 51
1433
+ .b8 115
1434
+ .b8 116
1435
+ .b8 118
1436
+ .b8 117
1437
+ .b8 102
1438
+ .b8 120
1439
+ .b8 97
1440
+ .b8 102
1441
+ .b8 109
1442
+ .b8 113
1443
+ .b8 51
1444
+ .b8 107
1445
+ .b8 107
1446
+ .b8 53
1447
+ .b8 104
1448
+ .b8 111
1449
+ .b8 100
1450
+ .b8 97
1451
+ .b8 122
1452
+ .b8 122
1453
+ .b8 50
1454
+ .b8 101
1455
+ .b8 102
1456
+ .b8 99
1457
+ .b8 116
1458
+ .b8 102
1459
+ .b8 102
1460
+ .b8 116
1461
+ .b8 101
1462
+ .b8 54
1463
+ .b8 52
1464
+ .b8 54
1465
+ .b8 122
1466
+ .b8 110
1467
+ .b8 106
1468
+ .b8 100
1469
+ .b8 110
1470
+ .b8 118
1471
+ .b8 51
1472
+ .b8 108
1473
+ .b8 113
1474
+ .b8 105
1475
+ .b8 53
1476
+ .b8 111
1477
+ .b8 97
1478
+ .b8 46
1479
+ .b8 112
1480
+ .b8 121
1481
+ .b8 0
1482
+ .b32 .debug_line
1483
+ .b8 47
1484
+ .b8 116
1485
+ .b8 109
1486
+ .b8 112
1487
+ .b8 47
1488
+ .b8 116
1489
+ .b8 111
1490
+ .b8 114
1491
+ .b8 99
1492
+ .b8 104
1493
+ .b8 105
1494
+ .b8 110
1495
+ .b8 100
1496
+ .b8 117
1497
+ .b8 99
1498
+ .b8 116
1499
+ .b8 111
1500
+ .b8 114
1501
+ .b8 95
1502
+ .b8 114
1503
+ .b8 111
1504
+ .b8 111
1505
+ .b8 116
1506
+ .b8 47
1507
+ .b8 108
1508
+ .b8 104
1509
+ .b8 0
1510
+ .b8 1
1511
+ .b64 $L__func_begin0
1512
+ .b64 $L__func_end0
1513
+ .b8 2
1514
+ .b8 116
1515
+ .b8 114
1516
+ .b8 105
1517
+ .b8 116
1518
+ .b8 111
1519
+ .b8 110
1520
+ .b8 95
1521
+ .b8 95
1522
+ .b8 48
1523
+ .b8 100
1524
+ .b8 49
1525
+ .b8 100
1526
+ .b8 50
1527
+ .b8 100
1528
+ .b8 51
1529
+ .b8 100
1530
+ .b8 52
1531
+ .b8 100
1532
+ .b8 53
1533
+ .b8 100
1534
+ .b8 101
1535
+ .b8 54
1536
+ .b8 100
1537
+ .b8 101
1538
+ .b8 0
1539
+ .b8 116
1540
+ .b8 114
1541
+ .b8 105
1542
+ .b8 116
1543
+ .b8 111
1544
+ .b8 110
1545
+ .b8 95
1546
+ .b8 95
1547
+ .b8 48
1548
+ .b8 100
1549
+ .b8 49
1550
+ .b8 100
1551
+ .b8 50
1552
+ .b8 100
1553
+ .b8 51
1554
+ .b8 100
1555
+ .b8 52
1556
+ .b8 100
1557
+ .b8 53
1558
+ .b8 100
1559
+ .b8 101
1560
+ .b8 54
1561
+ .b8 100
1562
+ .b8 101
1563
+ .b8 0
1564
+ .b8 1
1565
+ .b8 18
1566
+ .b8 1
1567
+ .b8 1
1568
+ .b8 3
1569
+ .b64 $L__func_begin0
1570
+ .b64 $L__func_end0
1571
+ .b8 1
1572
+ .b8 156
1573
+ .b32 125
1574
+ .b8 4
1575
+ .b32 125
1576
+ .b64 $L__tmp1
1577
+ .b64 $L__tmp2
1578
+ .b8 2
1579
+ .b8 44
1580
+ .b8 38
1581
+ .b8 5
1582
+ .b32 125
1583
+ .b64 $L__tmp2
1584
+ .b64 $L__tmp36
1585
+ .b8 2
1586
+ .b8 50
1587
+ .b8 41
1588
+ .b8 4
1589
+ .b32 125
1590
+ .b64 $L__tmp2
1591
+ .b64 $L__tmp36
1592
+ .b8 2
1593
+ .b8 120
1594
+ .b8 46
1595
+ .b8 0
1596
+ .b8 4
1597
+ .b32 125
1598
+ .b64 $L__tmp3
1599
+ .b64 $L__tmp31
1600
+ .b8 2
1601
+ .b8 50
1602
+ .b8 41
1603
+ .b8 0
1604
+ .b8 0
1605
+ }
1606
+ .section .debug_pubnames
1607
+ {
1608
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1609
+ $L__pubNames_start0:
1610
+ .b8 2
1611
+ .b8 0
1612
+ .b32 .debug_info
1613
+ .b32 302
1614
+ .b32 125
1615
+ .b8 116
1616
+ .b8 114
1617
+ .b8 105
1618
+ .b8 116
1619
+ .b8 111
1620
+ .b8 110
1621
+ .b8 95
1622
+ .b8 95
1623
+ .b8 48
1624
+ .b8 100
1625
+ .b8 49
1626
+ .b8 100
1627
+ .b8 50
1628
+ .b8 100
1629
+ .b8 51
1630
+ .b8 100
1631
+ .b8 52
1632
+ .b8 100
1633
+ .b8 53
1634
+ .b8 100
1635
+ .b8 101
1636
+ .b8 54
1637
+ .b8 100
1638
+ .b8 101
1639
+ .b8 0
1640
+ .b32 0
1641
+ $L__pubNames_end0:
1642
+ }
1643
+ .section .debug_pubtypes
1644
+ {
1645
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1646
+ $L__pubTypes_start0:
1647
+ .b8 2
1648
+ .b8 0
1649
+ .b32 .debug_info
1650
+ .b32 302
1651
+ .b32 0
1652
+ $L__pubTypes_end0:
1653
+ }
1654
+ .section .debug_loc { }
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttir ADDED
@@ -0,0 +1,104 @@
+ module {
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
+ %cst_1 = arith.constant 0.000000e+00 : f32
+ %cst_2 = arith.constant dense<256> : tensor<16x1xi64>
+ %cst_3 = arith.constant dense<50257> : tensor<16x1xi64>
+ %cst_4 = arith.constant dense<0> : tensor<16x1xi64>
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<16x256xf32>
+ %cst_8 = arith.constant dense<256> : tensor<16x1xi32>
+ %cst_9 = arith.constant dense<256> : tensor<1x256xi32>
+ %cst_10 = arith.constant dense<512> : tensor<16x1xi32>
+ %c16_i32 = arith.constant 16 : i32
+ %0 = tt.get_program_id x : i32
+ %1 = arith.muli %0, %c16_i32 : i32
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
+ %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
+ %5 = arith.addi %4, %3 : tensor<16x1xi32>
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
+ %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
+ %11 = arith.remsi %5, %cst_10 : tensor<16x1xi32>
+ %12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32>
+ %13 = arith.muli %11, %cst_8 : tensor<16x1xi32>
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<16x256xi32>
+ %15 = tt.broadcast %13 : (tensor<16x1xi32>) -> tensor<16x256xi32>
+ %16 = arith.addi %14, %15 : tensor<16x256xi32>
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
+ %18 = tt.addptr %17, %16 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi32>
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<16x256xi1>
+ %20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
+ %21 = arith.addi %10, %cst_3 : tensor<16x1xi64>
+ %22 = arith.cmpi slt, %10, %cst_4 : tensor<16x1xi64>
+ %23 = arith.select %22, %21, %10 : tensor<16x1xi1>, tensor<16x1xi64>
+ %24 = arith.cmpi sge, %23, %cst_4 : tensor<16x1xi64>
+ %25 = arith.cmpi slt, %23, %cst_3 : tensor<16x1xi64>
+ %26 = arith.andi %24, %25 : tensor<16x1xi1>
+ tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
+ %27 = arith.muli %23, %cst_2 : tensor<16x1xi64>
+ %28 = tt.broadcast %27 : (tensor<16x1xi64>) -> tensor<16x256xi64>
+ %29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
+ %30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<16x256xi64>
+ %31 = arith.addi %30, %28 : tensor<16x256xi64>
+ %32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
+ %33 = tt.addptr %32, %31 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi64>
+ %34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
+ %35 = arith.addf %34, %20 : tensor<16x256xf32>
+ %36 = arith.addf %35, %cst_7 : tensor<16x256xf32>
+ %37 = arith.subf %35, %36 : tensor<16x256xf32>
+ %38 = arith.mulf %35, %37 : tensor<16x256xf32>
+ %39 = arith.addf %38, %cst_7 : tensor<16x256xf32>
+ %40 = arith.select %19, %36, %cst_7 : tensor<16x256xi1>, tensor<16x256xf32>
+ %41 = arith.select %19, %39, %cst_7 : tensor<16x256xi1>, tensor<16x256xf32>
+ %42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32>
+ %43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<16x256xf32>
+ %44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
+ %68 = arith.subf %arg10, %arg7 : f32
+ %69 = arith.addf %arg9, %arg12 : f32
+ %70 = arith.cmpf oeq, %69, %cst_1 : f32
+ %71 = arith.divf %arg12, %69 : f32
+ %72 = arith.select %70, %cst_1, %71 : f32
+ %73 = arith.mulf %68, %72 : f32
+ %74 = arith.addf %arg7, %73 : f32
+ %75 = arith.addf %arg8, %arg11 : f32
+ %76 = arith.mulf %68, %68 : f32
+ %77 = arith.mulf %76, %arg9 : f32
+ %78 = arith.mulf %77, %72 : f32
+ %79 = arith.addf %75, %78 : f32
+ tt.reduce.return %74, %79, %69 : f32, f32, f32
+ }) : (tensor<16x256xf32>, tensor<16x256xf32>, tensor<16x256xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
+ %45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
+ %46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
+ %47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
+ %48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
+ %49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
+ %50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
+ tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
+ %51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32>
+ %52 = arith.addf %51, %47 : tensor<16x256xf32>
+ %53 = tt.broadcast %45 : (tensor<16x1xf32>) -> tensor<16x256xf32>
+ %54 = arith.subf %52, %53 : tensor<16x256xf32>
+ %55 = arith.divf %46, %cst_6 : tensor<16x1xf32>
+ %56 = arith.addf %55, %cst_5 : tensor<16x1xf32>
+ %57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32>
+ %58 = tt.broadcast %57 : (tensor<16x1xf32>) -> tensor<16x256xf32>
+ %59 = arith.mulf %54, %58 : tensor<16x256xf32>
+ %60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<16x256xf32>
+ %61 = arith.mulf %59, %60 : tensor<16x256xf32>
+ %62 = arith.muli %5, %cst_8 : tensor<16x1xi32>
+ %63 = tt.broadcast %62 : (tensor<16x1xi32>) -> tensor<16x256xi32>
+ %64 = arith.addi %14, %63 : tensor<16x256xi32>
+ %65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
+ %66 = tt.addptr %65, %64 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32>
+ %67 = arith.truncf %61 : tensor<16x256xf32> to tensor<16x256xbf16>
+ tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16>
+ tt.return
+ }
+ }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx ADDED
@@ -0,0 +1,465 @@
+ //
+ // Generated by LLVM NVPTX Back-End
+ //
+
+ .version 8.2
+ .target sm_89
+ .address_size 64
+
+ // .globl triton__0d1d2d34e
+
+ .visible .entry triton__0d1d2d34e(
+ .param .u64 triton__0d1d2d34e_param_0,
+ .param .u64 triton__0d1d2d34e_param_1,
+ .param .u64 triton__0d1d2d34e_param_2,
+ .param .u32 triton__0d1d2d34e_param_3,
+ .param .u32 triton__0d1d2d34e_param_4
+ )
+ .maxntid 64, 1, 1
+ {
+ .reg .pred %p<6>;
+ .reg .b32 %r<27>;
+ .reg .f32 %f<9>;
+ .reg .b64 %rd<24>;
+ .loc 1 18 0
+ $L__func_begin0:
+ .loc 1 18 0
+
+ ld.param.u64 %rd4, [triton__0d1d2d34e_param_0];
+ ld.param.u64 %rd5, [triton__0d1d2d34e_param_1];
+ $L__tmp0:
+ .loc 1 25 34
+ mov.u32 %r7, %tid.x;
+ and.b32 %r8, %r7, 7;
+ ld.param.u64 %rd6, [triton__0d1d2d34e_param_2];
+ .loc 1 28 30
+ mul.wide.u32 %rd7, %r8, 4;
+ add.s64 %rd1, %rd5, %rd7;
+ mov.b32 %r2, 0;
+ mov.pred %p1, -1;
+ .loc 1 28 35
+ mov.u32 %r1, 0x0;
+ @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+ @!%p1 mov.u32 %r1, %r2;
+ mov.b32 %f1, %r1;
+ .loc 1 29 30
+ mul.wide.u32 %rd8, %r8, 8;
+ add.s64 %rd3, %rd6, %rd8;
+ .loc 1 29 35
+ mov.u64 %rd2, 0x0;
+ @%p1 ld.global.b64 { %rd2 }, [ %rd3 + 0 ];
+ @!%p1 mov.u64 %rd2, 0x0;
+ $L__tmp1:
+ .loc 2 243 36
+ shfl.sync.bfly.b32 %r9, %r1, 4, 31, -1;
+ mov.b32 %f2, %r9;
+ $L__tmp2:
+ .loc 2 233 15
+ add.f32 %f3, %f1, %f2;
+ $L__tmp3:
+ .loc 2 243 36
+ mov.b32 %r10, %f3;
+ shfl.sync.bfly.b32 %r11, %r10, 2, 31, -1;
+ mov.b32 %f4, %r11;
+ $L__tmp4:
+ .loc 2 233 15
+ add.f32 %f5, %f3, %f4;
+ $L__tmp5:
+ .loc 2 243 36
+ mov.b32 %r12, %f5;
+ shfl.sync.bfly.b32 %r13, %r12, 1, 31, -1;
+ mov.b32 %f6, %r13;
+ $L__tmp6:
+ .loc 2 233 15
+ add.f32 %f7, %f5, %f6;
+ $L__tmp7:
+ .loc 2 243 36
+ cvt.u32.u64 %r14, %rd2;
+ shfl.sync.bfly.b32 %r15, %r14, 4, 31, -1;
+ { .reg .b32 tmp; mov.b64 {tmp, %r16}, %rd2; }
+ shfl.sync.bfly.b32 %r17, %r16, 4, 31, -1;
+ cvt.u64.u32 %rd9, %r15;
+ cvt.u64.u32 %rd10, %r17;
+ shl.b64 %rd11, %rd10, 32;
+ or.b64 %rd12, %rd9, %rd11;
+ $L__tmp8:
+ .loc 2 233 15
+ add.s64 %rd13, %rd2, %rd12;
+ $L__tmp9:
+ .loc 2 243 36
+ cvt.u32.u64 %r18, %rd13;
+ shfl.sync.bfly.b32 %r19, %r18, 2, 31, -1;
+ { .reg .b32 tmp; mov.b64 {tmp, %r20}, %rd13; }
+ shfl.sync.bfly.b32 %r21, %r20, 2, 31, -1;
+ cvt.u64.u32 %rd14, %r19;
+ cvt.u64.u32 %rd15, %r21;
+ shl.b64 %rd16, %rd15, 32;
+ or.b64 %rd17, %rd14, %rd16;
+ $L__tmp10:
+ .loc 2 233 15
+ add.s64 %rd18, %rd13, %rd17;
+ $L__tmp11:
+ .loc 2 243 36
+ cvt.u32.u64 %r22, %rd18;
+ shfl.sync.bfly.b32 %r23, %r22, 1, 31, -1;
+ { .reg .b32 tmp; mov.b64 {tmp, %r24}, %rd18; }
+ shfl.sync.bfly.b32 %r25, %r24, 1, 31, -1;
+ cvt.u64.u32 %rd19, %r23;
+ cvt.u64.u32 %rd20, %r25;
+ shl.b64 %rd21, %rd20, 32;
+ or.b64 %rd22, %rd19, %rd21;
+ $L__tmp12:
+ .loc 2 233 15
+ add.s64 %rd23, %rd18, %rd22;
+ $L__tmp13:
+ .loc 1 36 20
+ cvt.rn.f32.s64 %f8, %rd23;
+ .loc 1 37 19
+ mov.b32 %r4, %f7;
+ mov.b32 %r5, %f8;
+ div.full.f32 %r6, %r4, %r5;
+ .loc 1 38 4
+ bar.sync 0;
+ .loc 1 39 71
+ and.b32 %r26, %r7, 63;
+ setp.eq.s32 %p5, %r26, 0;
+ @%p5 st.global.b32 [ %rd4 + 0 ], { %r6 };
+ .loc 1 39 4
+ ret;
+ $L__tmp14:
+ $L__func_end0:
+
+ }
+ .file 1 "/tmp/torchinductor_root/2q/c2qomesxoic3sfzpdzftrhej7z6hhd6pritis2f4ye2ckqoetmyt.py"
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+ .section .debug_abbrev
+ {
+ .b8 1
+ .b8 17
+ .b8 1
+ .b8 37
+ .b8 8
+ .b8 19
+ .b8 5
+ .b8 3
+ .b8 8
+ .b8 16
+ .b8 6
+ .b8 27
+ .b8 8
+ .b8 180
+ .b8 66
+ .b8 12
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 0
+ .b8 0
+ .b8 2
+ .b8 46
+ .b8 0
+ .b8 135
+ .b8 64
+ .b8 8
+ .b8 3
+ .b8 8
+ .b8 58
+ .b8 11
+ .b8 59
+ .b8 11
+ .b8 63
+ .b8 12
+ .b8 32
+ .b8 11
+ .b8 0
+ .b8 0
+ .b8 3
+ .b8 46
+ .b8 1
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 64
+ .b8 10
+ .b8 49
+ .b8 19
+ .b8 0
+ .b8 0
+ .b8 4
+ .b8 29
+ .b8 0
+ .b8 49
+ .b8 19
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 88
+ .b8 11
+ .b8 89
+ .b8 11
+ .b8 87
+ .b8 11
+ .b8 0
+ .b8 0
+ .b8 5
+ .b8 29
+ .b8 1
+ .b8 49
+ .b8 19
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 88
+ .b8 11
+ .b8 89
+ .b8 11
+ .b8 87
+ .b8 11
+ .b8 0
+ .b8 0
+ .b8 0
+ }
+ .section .debug_info
+ {
+ .b32 333
+ .b8 2
+ .b8 0
+ .b32 .debug_abbrev
+ .b8 8
+ .b8 1
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 0
+ .b8 2
+ .b8 0
+ .b8 99
+ .b8 50
+ .b8 113
+ .b8 111
+ .b8 109
+ .b8 101
+ .b8 115
+ .b8 120
+ .b8 111
+ .b8 105
+ .b8 99
+ .b8 51
+ .b8 115
+ .b8 102
+ .b8 122
+ .b8 112
+ .b8 100
+ .b8 122
+ .b8 102
+ .b8 116
+ .b8 114
+ .b8 104
+ .b8 101
+ .b8 106
+ .b8 55
+ .b8 122
+ .b8 54
+ .b8 104
+ .b8 104
+ .b8 100
+ .b8 54
+ .b8 112
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 105
+ .b8 115
+ .b8 50
+ .b8 102
+ .b8 52
+ .b8 121
+ .b8 101
+ .b8 50
+ .b8 99
+ .b8 107
+ .b8 113
+ .b8 111
+ .b8 101
+ .b8 116
+ .b8 109
+ .b8 121
+ .b8 116
+ .b8 46
+ .b8 112
+ .b8 121
+ .b8 0
+ .b32 .debug_line
+ .b8 47
+ .b8 116
+ .b8 109
+ .b8 112
+ .b8 47
+ .b8 116
+ .b8 111
+ .b8 114
+ .b8 99
+ .b8 104
+ .b8 105
+ .b8 110
+ .b8 100
+ .b8 117
+ .b8 99
+ .b8 116
+ .b8 111
+ .b8 114
+ .b8 95
+ .b8 114
+ .b8 111
+ .b8 111
+ .b8 116
+ .b8 47
+ .b8 50
+ .b8 113
+ .b8 0
+ .b8 1
+ .b64 $L__func_begin0
+ .b64 $L__func_end0
+ .b8 2
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 95
+ .b8 95
+ .b8 48
+ .b8 100
+ .b8 49
+ .b8 100
+ .b8 50
+ .b8 100
+ .b8 51
+ .b8 52
+ .b8 101
+ .b8 0
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 95
+ .b8 95
+ .b8 48
+ .b8 100
+ .b8 49
+ .b8 100
+ .b8 50
+ .b8 100
+ .b8 51
+ .b8 52
+ .b8 101
+ .b8 0
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 1
+ .b8 3
+ .b64 $L__func_begin0
+ .b64 $L__func_end0
+ .b8 1
+ .b8 156
+ .b32 125
+ .b8 4
+ .b32 125
+ .b64 $L__tmp1
+ .b64 $L__tmp6
+ .b8 2
+ .b8 32
+ .b8 24
+ .b8 5
+ .b32 125
+ .b64 $L__tmp2
+ .b64 $L__tmp7
+ .b8 2
+ .b8 32
+ .b8 24
+ .b8 4
+ .b32 125
+ .b64 $L__tmp2
+ .b64 $L__tmp7
+ .b8 2
+ .b8 243
+ .b8 36
+ .b8 0
+ .b8 4
+ .b32 125
+ .b64 $L__tmp7
+ .b64 $L__tmp12
+ .b8 2
+ .b8 35
+ .b8 24
+ .b8 5
+ .b32 125
+ .b64 $L__tmp8
+ .b64 $L__tmp13
+ .b8 2
+ .b8 35
+ .b8 24
+ .b8 4
+ .b32 125
+ .b64 $L__tmp8
+ .b64 $L__tmp13
+ .b8 2
+ .b8 243
+ .b8 36
+ .b8 0
+ .b8 0
+ .b8 0
+ }
+ .section .debug_pubnames
+ {
+ .b32 $L__pubNames_end0-$L__pubNames_start0
+ $L__pubNames_start0:
+ .b8 2
+ .b8 0
+ .b32 .debug_info
+ .b32 337
+ .b32 125
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 95
+ .b8 95
+ .b8 48
+ .b8 100
+ .b8 49
+ .b8 100
+ .b8 50
+ .b8 100
+ .b8 51
+ .b8 52
+ .b8 101
+ .b8 0
+ .b32 0
+ $L__pubNames_end0:
+ }
+ .section .debug_pubtypes
+ {
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
+ $L__pubTypes_start0:
+ .b8 2
+ .b8 0
+ .b32 .debug_info
+ .b32 337
+ .b32 0
+ $L__pubTypes_end0:
+ }
+ .section .debug_loc { }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttgir ADDED
@@ -0,0 +1,39 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<0> : tensor<1x8xi64, #blocked>
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
+ %cst_1 = arith.constant dense<8> : tensor<1x8xi32, #blocked>
+ %c0_i32 = arith.constant 0 : i32
+ %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+ %1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
+ %2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32, #blocked>
+ %3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
+ %4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
+ %5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
+ %6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>, #blocked>
+ %7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>, #blocked>, tensor<1x8xi32, #blocked>
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64, #blocked>
+ %9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1, #blocked>, tensor<1x8xf32, #blocked>
+ %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
+ ^bb0(%arg5: f32, %arg6: f32):
+ %19 = arith.addf %arg5, %arg6 : f32
+ tt.reduce.return %19 : f32
+ }) : (tensor<1x8xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+ %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
+ %12 = arith.select %2, %8, %cst : tensor<1x8xi1, #blocked>, tensor<1x8xi64, #blocked>
+ %13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
+ ^bb0(%arg5: i64, %arg6: i64):
+ %19 = arith.addi %arg5, %arg6 : i64
+ tt.reduce.return %19 : i64
+ }) : (tensor<1x8xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+ %14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xi64, #blocked>
+ %15 = arith.sitofp %14 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked>
+ %16 = arith.divf %11, %15 : tensor<1x1xf32, #blocked>
+ gpu.barrier
+ %17 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
+ %18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked>
+ tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked>
+ tt.return
+ }
+ }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir ADDED
@@ -0,0 +1,38 @@
+ module {
+ tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+ %c0_i32 = arith.constant 0 : i32
+ %cst = arith.constant dense<0> : tensor<1x8xi64>
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
+ %cst_1 = arith.constant dense<8> : tensor<1x8xi32>
+ %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
+ %1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
+ %2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32>
+ %3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
+ %4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
+ %5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32>
+ %6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>>
+ %7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>>, tensor<1x8xi32>
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64>
+ %9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1>, tensor<1x8xf32>
+ %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
+ ^bb0(%arg5: f32, %arg6: f32):
+ %19 = arith.addf %arg5, %arg6 : f32
+ tt.reduce.return %19 : f32
+ }) : (tensor<1x8xf32>) -> tensor<1xf32>
+ %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
+ %12 = arith.select %2, %8, %cst : tensor<1x8xi1>, tensor<1x8xi64>
+ %13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
+ ^bb0(%arg5: i64, %arg6: i64):
+ %19 = arith.addi %arg5, %arg6 : i64
+ tt.reduce.return %19 : i64
+ }) : (tensor<1x8xi64>) -> tensor<1xi64>
+ %14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64>) -> tensor<1x1xi64>
+ %15 = arith.sitofp %14 : tensor<1x1xi64> to tensor<1x1xf32>
+ %16 = arith.divf %11, %15 : tensor<1x1xf32>
+ gpu.barrier
+ %17 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
+ %18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
+ tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
+ tt.return
+ }
+ }
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin ADDED
Binary file (31.3 kB).
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir ADDED
@@ -0,0 +1,550 @@
+ ; ModuleID = 'LLVMDialectModule'
+ source_filename = "LLVMDialectModule"
+
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+ @global_smem = external addrspace(3) global [0 x i8]
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+ %10 = and i32 %9, 31, !dbg !10
+ %11 = lshr i32 %9, 5, !dbg !10
+ %12 = lshr i32 %9, 6, !dbg !10
+ %13 = and i32 %12, 1, !dbg !10
+ %14 = and i32 %9, 1, !dbg !10
+ %15 = and i32 %11, 1, !dbg !11
+ %urem = shl i32 %9, 2, !dbg !11
+ %16 = and i32 %urem, 252, !dbg !11
+ %17 = shl i32 %9, 1, !dbg !11
+ %18 = and i32 %17, 254, !dbg !11
+ %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
+ %20 = shl i32 %19, 1, !dbg !13
+ %21 = or i32 %20, %13, !dbg !14
+ %22 = or i32 %20, %14, !dbg !14
+ %23 = sext i32 %21 to i64, !dbg !15
+ %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
+ %25 = sext i32 %22 to i64, !dbg !15
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
+ %32 = srem i32 %21, 512, !dbg !17
+ %33 = shl nsw i32 %32, 8, !dbg !18
+ %34 = or i32 %33, %16, !dbg !19
+ %35 = sext i32 %34 to i64, !dbg !20
+ %36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !20
+ %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+ %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !21
+ %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !21
+ %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !21
+ %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !21
+ %42 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !21
+ %43 = insertelement <2 x i32> %42, i32 %38, i64 1, !dbg !21
+ %44 = bitcast <2 x i32> %43 to <2 x float>, !dbg !21
+ %45 = bitcast i32 %40 to float, !dbg !21
+ %46 = bitcast i32 %41 to float, !dbg !21
+ %47 = shl i32 %21, 8, !dbg !22
+ %48 = or i32 %47, %16, !dbg !23
+ %49 = sext i32 %48 to i64, !dbg !24
+ %50 = getelementptr i16, ptr addrspace(1) %3, i64 %49, !dbg !24
+ %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
+ %52 = extractvalue { i32, i32 } %51, 0, !dbg !25
+ %53 = extractvalue { i32, i32 } %51, 1, !dbg !25
+ %54 = trunc i32 %52 to i16, !dbg !25
+ %extelt.offset = lshr i32 %52, 16, !dbg !25
+ %55 = trunc i32 %extelt.offset to i16, !dbg !25
+ %56 = trunc i32 %53 to i16, !dbg !25
+ %extelt.offset1 = lshr i32 %53, 16, !dbg !25
+ %57 = trunc i32 %extelt.offset1 to i16, !dbg !25
+ %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !26
+ %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !26
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !26
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !26
+ %62 = add i64 %31, 50257, !dbg !27
+ %63 = icmp slt i64 %27, 0, !dbg !28
+ %64 = icmp slt i64 %31, 0, !dbg !28
+ %65 = select i1 %64, i64 %62, i64 %31, !dbg !29
+ %66 = icmp ugt i64 %65, 50256, !dbg !30
+ br i1 %66, label %67, label %68, !dbg !31
+
+ 67: ; preds = %8
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
+ br label %68, !dbg !31
+
+ 68: ; preds = %67, %8
+ %69 = shl i64 %27, 8, !dbg !32
+ %70 = add i64 %69, 12865792, !dbg !32
+ %71 = select i1 %63, i64 %70, i64 %69, !dbg !32
+ %72 = zext nneg i32 %16 to i64
+ %73 = or i64 %71, %72, !dbg !33
+ %74 = getelementptr float, ptr addrspace(1) %1, i64 %73, !dbg !34
+ %75 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
+ %76 = extractvalue { i32, i32, i32, i32 } %75, 0, !dbg !35
+ %77 = extractvalue { i32, i32, i32, i32 } %75, 1, !dbg !35
+ %78 = extractvalue { i32, i32, i32, i32 } %75, 2, !dbg !35
+ %79 = extractvalue { i32, i32, i32, i32 } %75, 3, !dbg !35
+ %80 = bitcast i32 %78 to float, !dbg !35
+ %81 = bitcast i32 %79 to float, !dbg !35
+ %82 = fadd float %45, %80, !dbg !36
+ %83 = fadd float %46, %81, !dbg !36
+ %84 = fadd float %60, %82, !dbg !37
+ %85 = fadd float %61, %83, !dbg !37
+ %86 = insertelement <2 x i32> poison, i32 %77, i64 0, !dbg !35
+ %87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !35
+ %88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !35
+ %89 = fadd <2 x float> %44, %88, !dbg !36
+ %90 = insertelement <2 x float> poison, float %59, i64 0, !dbg !37
+ %91 = insertelement <2 x float> %90, float %58, i64 1, !dbg !37
+ %92 = fadd <2 x float> %91, %89, !dbg !37
+ %93 = fadd <2 x float> %92, zeroinitializer, !dbg !38
+ %94 = fadd float %84, 0.000000e+00, !dbg !38
+ %95 = fadd float %85, 0.000000e+00, !dbg !38
+ %96 = extractelement <2 x float> %93, i64 1, !dbg !42
+ %97 = extractelement <2 x float> %92, i64 1, !dbg !46
+ %98 = fsub float %97, %96, !dbg !47
+ %99 = extractelement <2 x float> %93, i64 0, !dbg !42
+ %100 = extractelement <2 x float> %92, i64 0, !dbg !46
+ %101 = fsub float %100, %99, !dbg !47
+ %102 = fsub float %84, %94, !dbg !47
+ %103 = fsub float %85, %95, !dbg !47
+ %104 = fmul float %97, %98, !dbg !46
+ %105 = fmul float %100, %101, !dbg !46
+ %106 = fmul float %84, %102, !dbg !46
+ %107 = fmul float %85, %103, !dbg !46
+ %108 = fadd float %104, 0.000000e+00, !dbg !48
+ %109 = fadd float %105, 0.000000e+00, !dbg !48
+ %110 = fadd float %106, 0.000000e+00, !dbg !48
+ %111 = fadd float %107, 0.000000e+00, !dbg !48
+ %112 = fsub float %99, %96, !dbg !42
+ %113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
+ %114 = fmul float %113, %112, !dbg !50
+ %115 = fadd float %96, %114, !dbg !51
+ %116 = fadd float %108, %109, !dbg !52
+ %117 = fmul float %112, %112, !dbg !53
+ %118 = fmul float %113, %117, !dbg !54
+ %119 = fadd float %118, %116, !dbg !55
+ %120 = fsub float %94, %115, !dbg !42
+ %121 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
+ %122 = fmul float %121, %120, !dbg !50
+ %123 = fadd float %115, %122, !dbg !51
+ %124 = fadd float %110, %119, !dbg !52
+ %125 = fmul float %120, %120, !dbg !53
+ %126 = fmul float %125, 2.000000e+00, !dbg !56
+ %127 = fmul float %121, %126, !dbg !54
+ %128 = fadd float %124, %127, !dbg !55
+ %129 = fsub float %95, %123, !dbg !42
+ %130 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
+ %131 = fmul float %130, %129, !dbg !50
+ %132 = fadd float %123, %131, !dbg !51
+ %133 = fadd float %111, %128, !dbg !52
+ %134 = fmul float %129, %129, !dbg !53
+ %135 = fmul float %134, 3.000000e+00, !dbg !56
+ %136 = fmul float %130, %135, !dbg !54
+ %137 = fadd float %133, %136, !dbg !55
+ %138 = bitcast float %132 to i32, !dbg !57
+ %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !57
+ %140 = bitcast i32 %139 to float, !dbg !57
+ %141 = bitcast float %137 to i32, !dbg !57
+ %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 16, i32 31), !dbg !57
+ %143 = bitcast i32 %142 to float, !dbg !57
+ %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !57
+ %145 = bitcast i32 %144 to float, !dbg !57
+ %146 = fsub float %140, %132, !dbg !42
+ %147 = fadd float %145, 4.000000e+00, !dbg !59
+ %148 = fcmp oeq float %147, 0.000000e+00, !dbg !60
+ %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float %147) #6, !dbg !49
+ %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !61
+ %151 = fmul float %150, %146, !dbg !50
+ %152 = fadd float %132, %151, !dbg !51
+ %153 = fadd float %137, %143, !dbg !52
+ %154 = fmul float %146, %146, !dbg !53
+ %155 = fmul float %154, 4.000000e+00, !dbg !56
+ %156 = fmul float %150, %155, !dbg !54
+ %157 = fadd float %153, %156, !dbg !55
+ %158 = bitcast float %152 to i32, !dbg !57
+ %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 8, i32 31), !dbg !57
+ %160 = bitcast i32 %159 to float, !dbg !57
+ %161 = bitcast float %157 to i32, !dbg !57
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !57
+ %163 = bitcast i32 %162 to float, !dbg !57
+ %164 = bitcast float %147 to i32, !dbg !57
+ %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !57
+ %166 = bitcast i32 %165 to float, !dbg !57
+ %167 = fsub float %160, %152, !dbg !42
+ %168 = fadd float %147, %166, !dbg !59
+ %169 = fcmp oeq float %168, 0.000000e+00, !dbg !60
+ %170 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %166, float %168) #6, !dbg !49
+ %171 = select i1 %169, float 0.000000e+00, float %170, !dbg !61
+ %172 = fmul float %171, %167, !dbg !50
+ %173 = fadd float %152, %172, !dbg !51
+ %174 = fadd float %157, %163, !dbg !52
+ %175 = fmul float %167, %167, !dbg !53
+ %176 = fmul float %147, %175, !dbg !56
+ %177 = fmul float %171, %176, !dbg !54
+ %178 = fadd float %174, %177, !dbg !55
+ %179 = bitcast float %173 to i32, !dbg !57
+ %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 4, i32 31), !dbg !57
+ %181 = bitcast i32 %180 to float, !dbg !57
+ %182 = bitcast float %178 to i32, !dbg !57
+ %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !57
+ %184 = bitcast i32 %183 to float, !dbg !57
+ %185 = bitcast float %168 to i32, !dbg !57
+ %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !57
+ %187 = bitcast i32 %186 to float, !dbg !57
+ %188 = fsub float %181, %173, !dbg !42
+ %189 = fadd float %168, %187, !dbg !59
+ %190 = fcmp oeq float %189, 0.000000e+00, !dbg !60
+ %191 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %187, float %189) #6, !dbg !49
+ %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !61
+ %193 = fmul float %192, %188, !dbg !50
+ %194 = fadd float %173, %193, !dbg !51
+ %195 = fadd float %178, %184, !dbg !52
+ %196 = fmul float %188, %188, !dbg !53
+ %197 = fmul float %168, %196, !dbg !56
+ %198 = fmul float %192, %197, !dbg !54
+ %199 = fadd float %195, %198, !dbg !55
+ %200 = bitcast float %194 to i32, !dbg !57
+ %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 2, i32 31), !dbg !57
+ %202 = bitcast i32 %201 to float, !dbg !57
+ %203 = bitcast float %199 to i32, !dbg !57
+ %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !57
+ %205 = bitcast i32 %204 to float, !dbg !57
+ %206 = bitcast float %189 to i32, !dbg !57
+ %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 2, i32 31), !dbg !57
+ %208 = bitcast i32 %207 to float, !dbg !57
+ %209 = fsub float %202, %194, !dbg !42
+ %210 = fadd float %189, %208, !dbg !59
+ %211 = fcmp oeq float %210, 0.000000e+00, !dbg !60
+ %212 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float %210) #6, !dbg !49
+ %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !61
+ %214 = fmul float %213, %209, !dbg !50
+ %215 = fadd float %194, %214, !dbg !51
+ %216 = fadd float %199, %205, !dbg !52
+ %217 = fmul float %209, %209, !dbg !53
+ %218 = fmul float %189, %217, !dbg !56
+ %219 = fmul float %213, %218, !dbg !54
+ %220 = fadd float %216, %219, !dbg !55
+ %221 = bitcast float %215 to i32, !dbg !57
+ %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !57
+ %223 = bitcast i32 %222 to float, !dbg !57
+ %224 = bitcast float %220 to i32, !dbg !57
+ %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 1, i32 31), !dbg !57
+ %226 = bitcast i32 %225 to float, !dbg !57
+ %227 = bitcast float %210 to i32, !dbg !57
+ %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 1, i32 31), !dbg !57
+ %229 = bitcast i32 %228 to float, !dbg !57
+ %230 = fsub float %223, %215, !dbg !42
+ %231 = fadd float %210, %229, !dbg !59
+ %232 = fcmp oeq float %231, 0.000000e+00, !dbg !60
+ %233 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %229, float %231) #6, !dbg !49
+ %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !61
+ %235 = fmul float %234, %230, !dbg !50
+ %236 = fadd float %215, %235, !dbg !51
+ %237 = fadd float %220, %226, !dbg !52
+ %238 = fmul float %230, %230, !dbg !53
+ %239 = fmul float %210, %238, !dbg !56
+ %240 = fmul float %234, %239, !dbg !54
+ %241 = fadd float %237, %240, !dbg !55
+ %242 = icmp eq i32 %10, 0, !dbg !57
+ %243 = shl nuw nsw i32 %13, 1, !dbg !57
+ %244 = or i32 %243, %15, !dbg !57
+ %245 = zext nneg i32 %244 to i64, !dbg !57
+ %246 = getelementptr float, ptr addrspace(3) @global_smem, i64 %245, !dbg !57
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %246, float %236, i1 %242) #6, !dbg !57
+ %247 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %245, !dbg !57
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %247, float %241, i1 %242) #6, !dbg !57
+ %248 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %245, !dbg !57
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, float %231, i1 %242) #6, !dbg !57
+ tail call void @llvm.nvvm.barrier0(), !dbg !57
+ %249 = icmp slt i32 %9, 4, !dbg !57
+ %250 = sext i32 %9 to i64, !dbg !57
+ %251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !57
+ %252 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %249) #6, !dbg !57
+ %253 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %250, !dbg !57
+ %254 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %253, i1 %249) #6, !dbg !57
+ %255 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %250, !dbg !57
+ %256 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %249) #6, !dbg !57
+ %257 = bitcast float %252 to i32, !dbg !57
+ %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !57
+ %259 = bitcast i32 %258 to float, !dbg !57
+ %260 = bitcast float %254 to i32, !dbg !57
+ %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 1, i32 31), !dbg !57
+ %262 = bitcast i32 %261 to float, !dbg !57
+ %263 = bitcast float %256 to i32, !dbg !57
+ %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !57
+ %265 = bitcast i32 %264 to float, !dbg !57
+ %266 = fsub float %259, %252, !dbg !42
+ %267 = fadd float %256, %265, !dbg !59
+ %268 = fcmp oeq float %267, 0.000000e+00, !dbg !60
+ %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %265, float %267) #6, !dbg !49
+ %270 = select i1 %268, float 0.000000e+00, float %269, !dbg !61
+ %271 = fmul float %266, %270, !dbg !50
+ %272 = fadd float %252, %271, !dbg !51
+ %273 = fadd float %254, %262, !dbg !52
+ %274 = fmul float %266, %266, !dbg !53
+ %275 = fmul float %256, %274, !dbg !56
+ %276 = fmul float %275, %270, !dbg !54
+ %277 = fadd float %273, %276, !dbg !55
+ %278 = icmp eq i32 %14, 0, !dbg !57
+ %279 = and i1 %249, %278, !dbg !57
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, float %272, i1 %279) #6, !dbg !57
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %253, float %277, i1 %279) #6, !dbg !57
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, float %267, i1 %279) #6, !dbg !57
+ tail call void @llvm.nvvm.barrier0(), !dbg !57
+ %280 = zext nneg i32 %243 to i64, !dbg !57
+ %281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !57
+ %282 = load float, ptr addrspace(3) %281, align 4, !dbg !57
+ %283 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %280, !dbg !57
+ %284 = load float, ptr addrspace(3) %283, align 4, !dbg !57
+ %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
+ %286 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
+ %287 = extractvalue { i32, i32 } %286, 0, !dbg !63
+ %288 = extractvalue { i32, i32 } %286, 1, !dbg !63
+ %289 = trunc i32 %287 to i16, !dbg !63
+ %extelt.offset2 = lshr i32 %287, 16, !dbg !63
+ %290 = trunc i32 %extelt.offset2 to i16, !dbg !63
+ %291 = trunc i32 %288 to i16, !dbg !63
+ %extelt.offset3 = lshr i32 %288, 16, !dbg !63
+ %292 = trunc i32 %extelt.offset3 to i16, !dbg !63
+ %293 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %289) #6, !dbg !64
+ %294 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %290) #6, !dbg !64
+ %295 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %291) #6, !dbg !64
+ %296 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %292) #6, !dbg !64
+ %297 = zext nneg i32 %18 to i64, !dbg !65
+ %298 = getelementptr float, ptr addrspace(1) %4, i64 %297, !dbg !65
+ %299 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %298, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !66
+ br i1 %66, label %300, label %301, !dbg !67
+
+ 300: ; preds = %68
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
+ br label %301, !dbg !67
+
+ 301: ; preds = %300, %68
+ %302 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+ %303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+ %304 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+ %305 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+ %306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+ %307 = fadd float %303, 0x3EE4F8B580000000, !dbg !70
+ %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+ %.not.i = icmp eq i32 %308, 0, !dbg !71
+ br i1 %.not.i, label %311, label %309, !dbg !71
+
+ 309: ; preds = %301
+ %310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %307), !dbg !71
+ br label %__nv_rsqrtf.exit, !dbg !71
+
+ 311: ; preds = %301
+ %312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %307), !dbg !71
+ br label %__nv_rsqrtf.exit, !dbg !71
+
+ __nv_rsqrtf.exit: ; preds = %309, %311
+ %.0.i = phi float [ %310, %309 ], [ %312, %311 ], !dbg !71
+ %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+ %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+ %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+ %316 = extractvalue { i32, i32, i32, i32 } %302, 3, !dbg !68
+ %317 = bitcast i32 %316 to float, !dbg !68
+ %318 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !62
+ %319 = bitcast i32 %318 to float, !dbg !62
+ %320 = fadd float %319, %317, !dbg !72
+ %321 = fadd float %296, %320, !dbg !73
+ %322 = fsub float %321, %282, !dbg !74
+ %323 = extractvalue { i32, i32, i32, i32 } %302, 2, !dbg !68
+ %324 = bitcast i32 %323 to float, !dbg !68
+ %325 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !62
+ %326 = bitcast i32 %325 to float, !dbg !62
+ %327 = fadd float %326, %324, !dbg !72
+ %328 = fadd float %295, %327, !dbg !73
+ %329 = fsub float %328, %282, !dbg !74
+ %330 = extractvalue { i32, i32, i32, i32 } %302, 1, !dbg !68
+ %331 = bitcast i32 %330 to float, !dbg !68
+ %332 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !62
+ %333 = bitcast i32 %332 to float, !dbg !62
+ %334 = fadd float %333, %331, !dbg !72
+ %335 = fadd float %294, %334, !dbg !73
+ %336 = fsub float %335, %282, !dbg !74
+ %337 = extractvalue { i32, i32, i32, i32 } %302, 0, !dbg !68
+ %338 = bitcast i32 %337 to float, !dbg !68
+ %339 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !62
+ %340 = bitcast i32 %339 to float, !dbg !62
+ %341 = fadd float %340, %338, !dbg !72
+ %342 = fadd float %293, %341, !dbg !73
+ %343 = fsub float %342, %282, !dbg !74
+ %344 = extractvalue { i32, i32 } %299, 0, !dbg !66
+ %345 = extractvalue { i32, i32 } %299, 1, !dbg !66
+ %346 = fmul float %343, %.0.i, !dbg !75
+ %347 = fmul float %336, %.0.i, !dbg !75
+ %348 = fmul float %329, %.0.i, !dbg !75
+ %349 = fmul float %322, %.0.i, !dbg !75
+ tail call void @llvm.nvvm.barrier0(), !dbg !76
+ %350 = getelementptr float, ptr addrspace(3) @global_smem, i64 %297, !dbg !76
+ %351 = insertelement <2 x i32> undef, i32 %344, i64 0, !dbg !76
+ %352 = insertelement <2 x i32> %351, i32 %345, i64 1, !dbg !76
+ store <2 x i32> %352, ptr addrspace(3) %350, align 8, !dbg !76
+ tail call void @llvm.nvvm.barrier0(), !dbg !76
+ %353 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !76
+ %354 = load float, ptr addrspace(3) %353, align 16, !dbg !76
+ %355 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 1, !dbg !76
+ %356 = load float, ptr addrspace(3) %355, align 4, !dbg !76
+ %357 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 2, !dbg !76
+ %358 = load float, ptr addrspace(3) %357, align 8, !dbg !76
+ %359 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 3, !dbg !76
+ %360 = load float, ptr addrspace(3) %359, align 4, !dbg !76
+ %361 = fmul float %346, %354, !dbg !76
+ %362 = fmul float %347, %356, !dbg !76
+ %363 = fmul float %348, %358, !dbg !76
+ %364 = fmul float %349, %360, !dbg !76
+ %365 = getelementptr i16, ptr addrspace(1) %5, i64 %49, !dbg !77
+ %366 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %361) #6, !dbg !78
+ %367 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %362) #6, !dbg !78
+ %368 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %363) #6, !dbg !78
+ %369 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %364) #6, !dbg !78
+ %370 = insertelement <2 x i16> undef, i16 %366, i64 0, !dbg !78
+ %371 = insertelement <2 x i16> %370, i16 %367, i64 1, !dbg !78
+ %372 = bitcast <2 x i16> %371 to i32, !dbg !78
+ %373 = insertelement <2 x i16> undef, i16 %368, i64 0, !dbg !78
+ %374 = insertelement <2 x i16> %373, i16 %369, i64 1, !dbg !78
+ %375 = bitcast <2 x i16> %374 to i32, !dbg !78
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %372, i32 %375, ptr addrspace(1) %365, i1 true) #6, !dbg !78
+ ret void, !dbg !79
+ }
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+ ; Function Attrs: convergent nocallback nounwind
+ declare void @llvm.nvvm.barrier0() #2
+
+ ; Function Attrs: alwaysinline nounwind
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+ %.not = icmp eq i32 %1, 0
+ br i1 %.not, label %4, label %2
+
+ 2: ; preds = %0
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+ br label %6
+
+ 4: ; preds = %0
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+ br label %6
+
+ 6: ; preds = %4, %2
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+ ret float %.0
+ }
+
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+ attributes #2 = { convergent nocallback nounwind }
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+ attributes #6 = { nounwind }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.dbg.cu = !{!2}
+ !nvvm.annotations = !{!4, !5, !5, !4}
+ !llvm.ident = !{!6}
+
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+ !3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci")
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+ !9 = !{}
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
+ !22 = !DILocation(line: 36, column: 44, scope: !7)
+ !23 = !DILocation(line: 36, column: 40, scope: !7)
+ !24 = !DILocation(line: 36, column: 34, scope: !7)
+ !25 = !DILocation(line: 36, column: 50, scope: !7)
+ !26 = !DILocation(line: 36, column: 101, scope: !7)
+ !27 = !DILocation(line: 37, column: 22, scope: !7)
+ !28 = !DILocation(line: 38, column: 22, scope: !7)
+ !29 = !DILocation(line: 39, column: 36, scope: !7)
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
+ !31 = !DILocation(line: 40, column: 55, scope: !7)
+ !32 = !DILocation(line: 41, column: 44, scope: !7)
+ !33 = !DILocation(line: 41, column: 40, scope: !7)
+ !34 = !DILocation(line: 41, column: 34, scope: !7)
+ !35 = !DILocation(line: 41, column: 52, scope: !7)
+ !36 = !DILocation(line: 42, column: 22, scope: !7)
+ !37 = !DILocation(line: 44, column: 22, scope: !7)
+ !38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+ !41 = !DILocation(line: 47, column: 41, scope: !39)
+ !42 = !DILocation(line: 108, column: 21, scope: !43, inlinedAt: !44)
+ !43 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
+ !44 = !DILocation(line: 120, column: 46, scope: !43, inlinedAt: !45)
+ !45 = !DILocation(line: 53, column: 44, scope: !43)
+ !46 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
+ !47 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
+ !48 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
+ !49 = !DILocation(line: 110, column: 60, scope: !43, inlinedAt: !44)
+ !50 = !DILocation(line: 112, column: 25, scope: !43, inlinedAt: !44)
+ !51 = !DILocation(line: 112, column: 17, scope: !43, inlinedAt: !44)
+ !52 = !DILocation(line: 113, column: 15, scope: !43, inlinedAt: !44)
+ !53 = !DILocation(line: 113, column: 30, scope: !43, inlinedAt: !44)
+ !54 = !DILocation(line: 113, column: 49, scope: !43, inlinedAt: !44)
+ !55 = !DILocation(line: 113, column: 22, scope: !43, inlinedAt: !44)
+ !56 = !DILocation(line: 113, column: 38, scope: !43, inlinedAt: !44)
+ !57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
+ !58 = !DILocation(line: 53, column: 44, scope: !39)
+ !59 = !DILocation(line: 109, column: 28, scope: !43, inlinedAt: !44)
+ !60 = !DILocation(line: 110, column: 39, scope: !43, inlinedAt: !44)
+ !61 = !DILocation(line: 110, column: 49, scope: !43, inlinedAt: !44)
+ !62 = !DILocation(line: 62, column: 51, scope: !7)
+ !63 = !DILocation(line: 63, column: 51, scope: !7)
+ !64 = !DILocation(line: 63, column: 103, scope: !7)
+ !65 = !DILocation(line: 64, column: 35, scope: !7)
+ !66 = !DILocation(line: 64, column: 40, scope: !7)
+ !67 = !DILocation(line: 68, column: 57, scope: !7)
+ !68 = !DILocation(line: 69, column: 54, scope: !7)
+ !69 = !DILocation(line: 75, column: 24, scope: !7)
+ !70 = !DILocation(line: 77, column: 24, scope: !7)
+ !71 = !DILocation(line: 78, column: 30, scope: !7)
+ !72 = !DILocation(line: 70, column: 24, scope: !7)
+ !73 = !DILocation(line: 72, column: 24, scope: !7)
+ !74 = !DILocation(line: 73, column: 24, scope: !7)
+ !75 = !DILocation(line: 79, column: 24, scope: !7)
+ !76 = !DILocation(line: 80, column: 24, scope: !7)
+ !77 = !DILocation(line: 82, column: 29, scope: !7)
+ !78 = !DILocation(line: 82, column: 52, scope: !7)
+ !79 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.cubin ADDED
Binary file (5.54 kB).
 
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.llir ADDED
@@ -0,0 +1,85 @@
+ ; ModuleID = 'LLVMDialectModule'
+ source_filename = "LLVMDialectModule"
+
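+ ; fp32 -> bf16 copy kernel: each thread loads eight consecutive floats from
+ ; %0 with two vectorized ld.global.v4.b32 loads, rounds them to bfloat16 via
+ ; cvt.rn.bf16.f32, packs pairs into <2 x i16> words, and writes all eight
+ ; values back to %1 with a single vectorized store.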
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+ %5 = shl i32 %4, 3, !dbg !8
+ %6 = and i32 %5, 1016, !dbg !8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+ %8 = shl i32 %7, 10, !dbg !10
+ %9 = or i32 %8, %6, !dbg !11
+ %10 = or i32 %9, 4, !dbg !11
+ %11 = sext i32 %9 to i64, !dbg !12
+ %12 = getelementptr float, ptr addrspace(1) %0, i64 %11, !dbg !12
+ %13 = sext i32 %10 to i64, !dbg !12
+ %14 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !12
+ %15 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %12, i1 true) #1, !dbg !13
+ %16 = extractvalue { i32, i32, i32, i32 } %15, 0, !dbg !13
+ %17 = extractvalue { i32, i32, i32, i32 } %15, 1, !dbg !13
+ %18 = extractvalue { i32, i32, i32, i32 } %15, 2, !dbg !13
+ %19 = extractvalue { i32, i32, i32, i32 } %15, 3, !dbg !13
+ %20 = bitcast i32 %16 to float, !dbg !13
+ %21 = bitcast i32 %17 to float, !dbg !13
+ %22 = bitcast i32 %18 to float, !dbg !13
+ %23 = bitcast i32 %19 to float, !dbg !13
+ %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %14, i1 true) #1, !dbg !13
+ %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !13
+ %26 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !13
+ %27 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !13
+ %28 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !13
+ %29 = bitcast i32 %25 to float, !dbg !13
+ %30 = bitcast i32 %26 to float, !dbg !13
+ %31 = bitcast i32 %27 to float, !dbg !13
+ %32 = bitcast i32 %28 to float, !dbg !13
+ %33 = getelementptr i16, ptr addrspace(1) %1, i64 %11, !dbg !14
+ %34 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
+ %35 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %21) #1, !dbg !15
+ %36 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %22) #1, !dbg !15
+ %37 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %23) #1, !dbg !15
+ %38 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %29) #1, !dbg !15
+ %39 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %30) #1, !dbg !15
+ %40 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %31) #1, !dbg !15
+ %41 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %32) #1, !dbg !15
+ %42 = insertelement <2 x i16> undef, i16 %34, i64 0, !dbg !15
+ %43 = insertelement <2 x i16> %42, i16 %35, i64 1, !dbg !15
+ %44 = bitcast <2 x i16> %43 to i32, !dbg !15
+ %45 = insertelement <2 x i16> undef, i16 %36, i64 0, !dbg !15
+ %46 = insertelement <2 x i16> %45, i16 %37, i64 1, !dbg !15
+ %47 = bitcast <2 x i16> %46 to i32, !dbg !15
+ %48 = insertelement <2 x i16> undef, i16 %38, i64 0, !dbg !15
+ %49 = insertelement <2 x i16> %48, i16 %39, i64 1, !dbg !15
+ %50 = bitcast <2 x i16> %49 to i32, !dbg !15
+ %51 = insertelement <2 x i16> undef, i16 %40, i64 0, !dbg !15
+ %52 = insertelement <2 x i16> %51, i16 %41, i64 1, !dbg !15
+ %53 = bitcast <2 x i16> %52 to i32, !dbg !15
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %44, i32 %47, i32 %50, i32 %53, ptr addrspace(1) %33, i1 true) #1, !dbg !15
+ ret void, !dbg !16
+ }
+
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #1 = { nounwind }
+
+ !llvm.module.flags = !{!0}
+ !llvm.dbg.cu = !{!1}
+ !nvvm.annotations = !{!3, !4, !4, !3}
+
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+ !2 = !DIFile(filename: "czjxjqxojsyyr4zmce6q6twysnucw6p4l5ujgp6ts2ecrm3ue3ex.py", directory: "/tmp/torchinductor_root/zj")
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+ !7 = !{}
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
+ !14 = !DILocation(line: 26, column: 25, scope: !5)
+ !15 = !DILocation(line: 26, column: 36, scope: !5)
+ !16 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin ADDED
Binary file (66.2 kB).
 
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx ADDED
@@ -0,0 +1,1810 @@
+ //
+ // Generated by LLVM NVPTX Back-End
+ //
+
+ .version 8.2
+ .target sm_89
+ .address_size 64
+
+ // .globl triton__0d1d2d3d4d5de6de
+ .extern .func __assertfail
+ (
+ .param .b64 __assertfail_param_0,
+ .param .b64 __assertfail_param_1,
+ .param .b32 __assertfail_param_2,
+ .param .b64 __assertfail_param_3,
+ .param .b64 __assertfail_param_4
+ )
+ ;
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+ .extern .shared .align 1 .b8 global_smem[];
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
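+ // triton__0d1d2d3d4d5de6de appears to be a fused gather + LayerNorm forward:
+ // int64 indices from param_0 select 256-float rows of an embedding table in
+ // param_1 (vocab size 50257, bounds-checked via __assertfail), a second fp32
+ // input (param_2) is added, mean/variance are accumulated with the Welford
+ // combines inlined from torch/_inductor/triton_helpers.py, and the result,
+ // normalized by rsqrt(var/256 + 1e-5) and scaled by the param_3 weights, is
+ // stored as bf16 to param_4.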
+ .visible .entry triton__0d1d2d3d4d5de6de(
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
+ )
+ .maxntid 256, 1, 1
+ {
+ .reg .pred %p<137>;
+ .reg .b16 %rs<17>;
+ .reg .b32 %r<408>;
+ .reg .f32 %f<614>;
+ .reg .b64 %rd<107>;
+ .loc 1 18 0
+ $L__func_begin0:
+ .loc 1 18 0
+
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_4];
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_3];
+ ld.param.u64 %rd49, [triton__0d1d2d3d4d5de6de_param_0];
+ ld.param.u64 %rd50, [triton__0d1d2d3d4d5de6de_param_1];
+ $L__tmp0:
+ .loc 1 22 44
+ mov.u32 %r13, %tid.x;
+ ld.param.u64 %rd51, [triton__0d1d2d3d4d5de6de_param_2];
+ bfe.u32 %r1, %r13, 3, 5;
+ and.b32 %r2, %r13, 63;
+ .loc 1 24 33
+ shl.b32 %r14, %r13, 3;
+ and.b32 %r3, %r14, 56;
+ .loc 1 31 36
+ shr.u32 %r4, %r13, 6;
+ .loc 1 21 28
+ mov.u32 %r11, %ctaid.x;
+ .loc 1 21 33
+ shl.b32 %r15, %r11, 6;
+ .loc 1 22 23
+ or.b32 %r16, %r15, %r1;
+ or.b32 %r17, %r16, 32;
+ or.b32 %r18, %r15, %r2;
+ .loc 1 26 30
+ mul.wide.s32 %rd52, %r16, 8;
+ add.s64 %rd15, %rd49, %rd52;
+ add.s64 %rd31, %rd15, 256;
+ mul.wide.s32 %rd53, %r18, 8;
+ add.s64 %rd47, %rd49, %rd53;
+ mov.pred %p1, -1;
+ .loc 1 26 35
+ mov.u64 %rd14, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd15 + 0 ];
+ mov.u64 %rd16, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd15 + 0 ];
+ mov.u64 %rd18, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd15 + 0 ];
+ mov.u64 %rd20, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd15 + 0 ];
+ mov.u64 %rd22, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd15 + 0 ];
+ mov.u64 %rd24, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd15 + 0 ];
+ mov.u64 %rd26, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd15 + 0 ];
+ mov.u64 %rd28, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd15 + 0 ];
+ mov.u64 %rd30, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ];
+ mov.u64 %rd32, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd31 + 0 ];
+ mov.u64 %rd34, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd31 + 0 ];
+ mov.u64 %rd36, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd31 + 0 ];
+ mov.u64 %rd38, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd31 + 0 ];
+ mov.u64 %rd40, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd31 + 0 ];
+ mov.u64 %rd42, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd31 + 0 ];
+ mov.u64 %rd44, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd31 + 0 ];
+ mov.u64 %rd46, 0x0;
+ @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
+ .loc 1 27 18
+ bfe.s32 %r19, %r11, 25, 1;
+ shr.u32 %r20, %r19, 23;
+ add.s32 %r21, %r16, %r20;
+ and.b32 %r22, %r21, 16776704;
+ sub.s32 %r23, %r16, %r22;
+ add.s32 %r24, %r17, %r20;
+ and.b32 %r25, %r24, 16776704;
+ sub.s32 %r26, %r17, %r25;
+ .loc 1 35 44
+ shl.b32 %r27, %r23, 8;
+ shl.b32 %r28, %r26, 8;
+ .loc 1 36 22
+ add.s64 %rd54, %rd46, 50257;
+ .loc 1 37 22
+ setp.lt.s64 %p18, %rd14, 0;
+ setp.lt.s64 %p19, %rd30, 0;
+ setp.lt.s64 %p20, %rd46, 0;
+ .loc 1 38 36
+ selp.b64 %rd1, %rd54, %rd46, %p20;
+ .loc 1 40 44
+ shl.b64 %rd55, %rd14, 8;
+ add.s64 %rd56, %rd55, 12865792;
+ selp.b64 %rd57, %rd56, %rd55, %p18;
+ shl.b64 %rd58, %rd30, 8;
+ add.s64 %rd59, %rd58, 12865792;
+ selp.b64 %rd60, %rd59, %rd58, %p19;
+ .loc 1 31 36
+ and.b32 %r29, %r13, 7;
+ mul.wide.u32 %rd2, %r29, 32;
+ shl.b64 %rd61, %rd60, 2;
+ or.b64 %rd62, %rd2, %rd61;
+ add.s64 %rd3, %rd50, %rd62;
+ shl.b64 %rd63, %rd57, 2;
+ or.b64 %rd64, %rd2, %rd63;
+ add.s64 %rd4, %rd50, %rd64;
+ or.b32 %r30, %r28, %r3;
+ mul.wide.s32 %rd65, %r30, 4;
+ add.s64 %rd5, %rd51, %rd65;
+ or.b32 %r31, %r27, %r3;
+ mul.wide.s32 %rd66, %r31, 4;
+ add.s64 %rd6, %rd51, %rd66;
+ mov.f32 %f550, 0f00000000;
+ mov.u64 %rd105, 0;
+ mov.b32 %r406, -64;
+ mov.f32 %f551, %f550;
+ mov.f32 %f552, %f550;
+ mov.f32 %f553, %f550;
+ mov.f32 %f554, %f550;
+ mov.f32 %f555, %f550;
+ mov.f32 %f556, %f550;
+ mov.f32 %f557, %f550;
+ mov.f32 %f558, %f550;
+ mov.f32 %f559, %f550;
+ mov.f32 %f560, %f550;
+ mov.f32 %f561, %f550;
+ mov.f32 %f562, %f550;
+ mov.f32 %f563, %f550;
+ mov.f32 %f564, %f550;
+ mov.f32 %f565, %f550;
+ mov.f32 %f566, %f550;
+ mov.f32 %f567, %f550;
+ mov.f32 %f568, %f550;
+ mov.f32 %f569, %f550;
+ mov.f32 %f570, %f550;
+ mov.f32 %f571, %f550;
+ mov.f32 %f572, %f550;
+ mov.f32 %f573, %f550;
+ mov.f32 %f574, %f550;
+ mov.f32 %f575, %f550;
+ mov.f32 %f576, %f550;
+ mov.f32 %f577, %f550;
+ mov.f32 %f578, %f550;
+ mov.f32 %f579, %f550;
+ mov.f32 %f580, %f550;
+ mov.f32 %f581, %f550;
+ mov.f32 %f582, %f550;
+ mov.f32 %f583, %f550;
+ mov.f32 %f584, %f550;
+ mov.f32 %f585, %f550;
+ mov.f32 %f586, %f550;
+ mov.f32 %f587, %f550;
+ mov.f32 %f588, %f550;
+ mov.f32 %f589, %f550;
+ mov.f32 %f590, %f550;
+ mov.f32 %f591, %f550;
+ mov.f32 %f592, %f550;
+ mov.f32 %f593, %f550;
+ mov.f32 %f594, %f550;
+ mov.f32 %f595, %f550;
+ mov.f32 %f596, %f550;
+ mov.f32 %f597, %f550;
+ mov.f32 %f598, %f550;
+ mov.f32 %f599, %f550;
+ mov.f32 %f600, %f550;
+ mov.f32 %f601, %f550;
+ mov.f32 %f602, %f550;
+ mov.f32 %f603, %f550;
+ mov.f32 %f604, %f550;
+ mov.f32 %f605, %f550;
+ mov.f32 %f606, %f550;
+ mov.f32 %f607, %f550;
+ mov.f32 %f608, %f550;
+ mov.f32 %f609, %f550;
+ mov.f32 %f610, %f550;
+ mov.f32 %f611, %f550;
+ mov.f32 %f612, %f550;
+ mov.f32 %f613, %f550;
+ bra.uni $L__BB0_1;
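+ // First pass: $L__BB0_1 loads one 64-column chunk of the gathered row and
+ // of the additive input (falling through to __assertfail on an out-of-range
+ // index), then $L__BB0_3 updates the per-lane Welford accumulators: running
+ // means (%f598..%f613), m2 sums (%f582..%f597) and counts (%f550..%f581),
+ // over four iterations (%r406 steps -64..128 by 64).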
+ $L__BB0_3:
+ .loc 1 40 40
+ add.s64 %rd78, %rd4, %rd105;
+ .loc 1 40 34
+ add.s64 %rd79, %rd78, 16;
+ add.s64 %rd80, %rd3, %rd105;
+ .loc 1 40 52
+ add.s64 %rd81, %rd80, 16;
+ mov.u32 %r65, 0x0;
+ mov.u32 %r66, 0x0;
+ mov.u32 %r67, 0x0;
+ mov.u32 %r68, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd78 + 0 ];
+ @!%p1 mov.u32 %r65, %r342;
+ @!%p1 mov.u32 %r66, %r342;
+ @!%p1 mov.u32 %r67, %r342;
+ @!%p1 mov.u32 %r68, %r342;
+ mov.b32 %f174, %r65;
+ mov.b32 %f175, %r66;
+ mov.b32 %f176, %r67;
+ mov.b32 %f177, %r68;
+ mov.u32 %r73, 0x0;
+ mov.u32 %r74, 0x0;
+ mov.u32 %r75, 0x0;
+ mov.u32 %r76, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r73, %r74, %r75, %r76 }, [ %rd79 + 0 ];
+ @!%p1 mov.u32 %r73, %r342;
+ @!%p1 mov.u32 %r74, %r342;
+ @!%p1 mov.u32 %r75, %r342;
+ @!%p1 mov.u32 %r76, %r342;
+ mov.b32 %f178, %r73;
+ mov.b32 %f179, %r74;
+ mov.b32 %f180, %r75;
+ mov.b32 %f181, %r76;
+ mov.u32 %r81, 0x0;
+ mov.u32 %r82, 0x0;
+ mov.u32 %r83, 0x0;
+ mov.u32 %r84, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r81, %r82, %r83, %r84 }, [ %rd80 + 0 ];
+ @!%p1 mov.u32 %r81, %r342;
+ @!%p1 mov.u32 %r82, %r342;
+ @!%p1 mov.u32 %r83, %r342;
+ @!%p1 mov.u32 %r84, %r342;
+ mov.b32 %f182, %r81;
+ mov.b32 %f183, %r82;
+ mov.b32 %f184, %r83;
+ mov.b32 %f185, %r84;
+ mov.u32 %r89, 0x0;
+ mov.u32 %r90, 0x0;
+ mov.u32 %r91, 0x0;
+ mov.u32 %r92, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd81 + 0 ];
+ @!%p1 mov.u32 %r89, %r342;
+ @!%p1 mov.u32 %r90, %r342;
+ @!%p1 mov.u32 %r91, %r342;
+ @!%p1 mov.u32 %r92, %r342;
+ mov.b32 %f186, %r89;
+ mov.b32 %f187, %r90;
+ mov.b32 %f188, %r91;
+ mov.b32 %f189, %r92;
+ .loc 1 41 22
+ add.f32 %f190, %f65, %f174;
+ add.f32 %f191, %f66, %f175;
+ add.f32 %f192, %f67, %f176;
+ add.f32 %f193, %f68, %f177;
+ add.f32 %f194, %f69, %f178;
+ add.f32 %f195, %f70, %f179;
+ add.f32 %f196, %f71, %f180;
+ add.f32 %f197, %f72, %f181;
+ add.f32 %f198, %f73, %f182;
+ add.f32 %f199, %f74, %f183;
+ add.f32 %f200, %f75, %f184;
+ add.f32 %f201, %f76, %f185;
+ add.f32 %f202, %f77, %f186;
+ add.f32 %f203, %f78, %f187;
+ add.f32 %f204, %f79, %f188;
+ add.f32 %f205, %f80, %f189;
+ $L__tmp1:
+ .loc 2 96 20
+ sub.f32 %f206, %f190, %f598;
+ sub.f32 %f207, %f191, %f599;
+ sub.f32 %f208, %f192, %f600;
+ sub.f32 %f209, %f193, %f601;
+ sub.f32 %f210, %f194, %f602;
+ sub.f32 %f211, %f195, %f603;
+ sub.f32 %f212, %f196, %f604;
+ sub.f32 %f213, %f197, %f605;
+ sub.f32 %f214, %f198, %f606;
+ sub.f32 %f215, %f199, %f607;
+ sub.f32 %f216, %f200, %f608;
+ sub.f32 %f217, %f201, %f609;
+ sub.f32 %f218, %f202, %f610;
+ sub.f32 %f219, %f203, %f611;
+ sub.f32 %f220, %f204, %f612;
+ sub.f32 %f221, %f205, %f613;
+ .loc 2 97 26
+ add.f32 %f550, %f550, 0f3F800000;
+ add.f32 %f551, %f551, 0f3F800000;
+ add.f32 %f552, %f552, 0f3F800000;
+ add.f32 %f553, %f553, 0f3F800000;
+ add.f32 %f554, %f554, 0f3F800000;
+ add.f32 %f555, %f555, 0f3F800000;
+ add.f32 %f556, %f556, 0f3F800000;
+ add.f32 %f557, %f557, 0f3F800000;
+ add.f32 %f558, %f558, 0f3F800000;
+ add.f32 %f559, %f559, 0f3F800000;
+ add.f32 %f560, %f560, 0f3F800000;
+ add.f32 %f561, %f561, 0f3F800000;
+ add.f32 %f562, %f562, 0f3F800000;
+ add.f32 %f563, %f563, 0f3F800000;
+ add.f32 %f564, %f564, 0f3F800000;
+ add.f32 %f565, %f565, 0f3F800000;
+ add.f32 %f566, %f566, 0f3F800000;
+ add.f32 %f567, %f567, 0f3F800000;
+ add.f32 %f568, %f568, 0f3F800000;
+ add.f32 %f569, %f569, 0f3F800000;
+ add.f32 %f570, %f570, 0f3F800000;
+ add.f32 %f571, %f571, 0f3F800000;
+ add.f32 %f572, %f572, 0f3F800000;
+ add.f32 %f573, %f573, 0f3F800000;
+ add.f32 %f574, %f574, 0f3F800000;
+ add.f32 %f575, %f575, 0f3F800000;
+ add.f32 %f576, %f576, 0f3F800000;
+ add.f32 %f577, %f577, 0f3F800000;
+ add.f32 %f578, %f578, 0f3F800000;
+ add.f32 %f579, %f579, 0f3F800000;
+ add.f32 %f580, %f580, 0f3F800000;
+ add.f32 %f581, %f581, 0f3F800000;
+ .loc 2 98 30
+ mov.b32 %r98, %f206;
+ mov.b32 %r99, %f550;
+ div.full.f32 %r97, %r98, %r99;
+ mov.b32 %f222, %r97;
+ mov.b32 %r101, %f207;
+ mov.b32 %r102, %f551;
+ div.full.f32 %r100, %r101, %r102;
+ mov.b32 %f223, %r100;
+ mov.b32 %r104, %f208;
+ mov.b32 %r105, %f552;
+ div.full.f32 %r103, %r104, %r105;
+ mov.b32 %f224, %r103;
+ mov.b32 %r107, %f209;
+ mov.b32 %r108, %f553;
+ div.full.f32 %r106, %r107, %r108;
+ mov.b32 %f225, %r106;
+ mov.b32 %r110, %f210;
+ mov.b32 %r111, %f554;
+ div.full.f32 %r109, %r110, %r111;
+ mov.b32 %f226, %r109;
+ mov.b32 %r113, %f211;
+ mov.b32 %r114, %f555;
+ div.full.f32 %r112, %r113, %r114;
+ mov.b32 %f227, %r112;
+ mov.b32 %r116, %f212;
+ mov.b32 %r117, %f556;
+ div.full.f32 %r115, %r116, %r117;
+ mov.b32 %f228, %r115;
+ mov.b32 %r119, %f213;
+ mov.b32 %r120, %f557;
+ div.full.f32 %r118, %r119, %r120;
+ mov.b32 %f229, %r118;
+ mov.b32 %r122, %f214;
+ mov.b32 %r123, %f558;
+ div.full.f32 %r121, %r122, %r123;
+ mov.b32 %f230, %r121;
+ mov.b32 %r125, %f215;
+ mov.b32 %r126, %f559;
+ div.full.f32 %r124, %r125, %r126;
+ mov.b32 %f231, %r124;
+ mov.b32 %r128, %f216;
+ mov.b32 %r129, %f560;
+ div.full.f32 %r127, %r128, %r129;
+ mov.b32 %f232, %r127;
+ mov.b32 %r131, %f217;
+ mov.b32 %r132, %f561;
+ div.full.f32 %r130, %r131, %r132;
+ mov.b32 %f233, %r130;
+ mov.b32 %r134, %f218;
+ mov.b32 %r135, %f562;
+ div.full.f32 %r133, %r134, %r135;
+ mov.b32 %f234, %r133;
+ mov.b32 %r137, %f219;
+ mov.b32 %r138, %f563;
+ div.full.f32 %r136, %r137, %r138;
+ mov.b32 %f235, %r136;
+ mov.b32 %r140, %f220;
+ mov.b32 %r141, %f564;
+ div.full.f32 %r139, %r140, %r141;
+ mov.b32 %f236, %r139;
+ mov.b32 %r143, %f221;
+ mov.b32 %r144, %f565;
+ div.full.f32 %r142, %r143, %r144;
+ mov.b32 %f237, %r142;
+ .loc 2 98 22
+ add.f32 %f598, %f598, %f222;
+ add.f32 %f599, %f599, %f223;
+ add.f32 %f600, %f600, %f224;
+ add.f32 %f601, %f601, %f225;
+ add.f32 %f602, %f602, %f226;
+ add.f32 %f603, %f603, %f227;
+ add.f32 %f604, %f604, %f228;
+ add.f32 %f605, %f605, %f229;
+ add.f32 %f606, %f606, %f230;
+ add.f32 %f607, %f607, %f231;
+ add.f32 %f608, %f608, %f232;
+ add.f32 %f609, %f609, %f233;
+ add.f32 %f610, %f610, %f234;
+ add.f32 %f611, %f611, %f235;
+ add.f32 %f612, %f612, %f236;
+ add.f32 %f613, %f613, %f237;
+ .loc 2 101 30
+ sub.f32 %f238, %f190, %f598;
+ sub.f32 %f239, %f191, %f599;
+ sub.f32 %f240, %f192, %f600;
+ sub.f32 %f241, %f193, %f601;
+ sub.f32 %f242, %f194, %f602;
+ sub.f32 %f243, %f195, %f603;
+ sub.f32 %f244, %f196, %f604;
+ sub.f32 %f245, %f197, %f605;
+ sub.f32 %f246, %f198, %f606;
+ sub.f32 %f247, %f199, %f607;
+ sub.f32 %f248, %f200, %f608;
+ sub.f32 %f249, %f201, %f609;
+ sub.f32 %f250, %f202, %f610;
+ sub.f32 %f251, %f203, %f611;
+ sub.f32 %f252, %f204, %f612;
+ sub.f32 %f253, %f205, %f613;
+ $L__tmp2:
+ .loc 1 47 48
+ fma.rn.f32 %f582, %f206, %f238, %f582;
+ fma.rn.f32 %f583, %f207, %f239, %f583;
+ fma.rn.f32 %f584, %f208, %f240, %f584;
+ fma.rn.f32 %f585, %f209, %f241, %f585;
+ fma.rn.f32 %f586, %f210, %f242, %f586;
+ fma.rn.f32 %f587, %f211, %f243, %f587;
+ fma.rn.f32 %f588, %f212, %f244, %f588;
+ fma.rn.f32 %f589, %f213, %f245, %f589;
+ fma.rn.f32 %f590, %f214, %f246, %f590;
+ fma.rn.f32 %f591, %f215, %f247, %f591;
+ fma.rn.f32 %f592, %f216, %f248, %f592;
+ fma.rn.f32 %f593, %f217, %f249, %f593;
+ fma.rn.f32 %f594, %f218, %f250, %f594;
+ fma.rn.f32 %f595, %f219, %f251, %f595;
+ fma.rn.f32 %f596, %f220, %f252, %f596;
+ fma.rn.f32 %f597, %f221, %f253, %f597;
+ .loc 1 31 36
+ add.s64 %rd105, %rd105, 256;
+ add.s32 %r406, %r406, 64;
+ setp.lt.u32 %p62, %r406, 192;
+ @%p62 bra $L__BB0_1;
+ bra.uni $L__BB0_4;
+ $L__BB0_1:
+ .loc 1 39 40
+ setp.lt.u64 %p41, %rd1, 50257;
+ .loc 1 35 34
+ add.s64 %rd67, %rd6, %rd105;
+ add.s64 %rd68, %rd67, 16;
+ add.s64 %rd69, %rd5, %rd105;
+ .loc 1 35 50
+ add.s64 %rd70, %rd69, 16;
+ mov.b32 %r342, 0;
+ mov.u32 %r32, 0x0;
+ mov.u32 %r33, 0x0;
+ mov.u32 %r34, 0x0;
+ mov.u32 %r35, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r32, %r33, %r34, %r35 }, [ %rd67 + 0 ];
+ @!%p1 mov.u32 %r32, %r342;
+ @!%p1 mov.u32 %r33, %r342;
+ @!%p1 mov.u32 %r34, %r342;
+ @!%p1 mov.u32 %r35, %r342;
+ mov.b32 %f65, %r32;
+ mov.b32 %f66, %r33;
+ mov.b32 %f67, %r34;
+ mov.b32 %f68, %r35;
+ mov.u32 %r40, 0x0;
+ mov.u32 %r41, 0x0;
+ mov.u32 %r42, 0x0;
+ mov.u32 %r43, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r40, %r41, %r42, %r43 }, [ %rd68 + 0 ];
+ @!%p1 mov.u32 %r40, %r342;
+ @!%p1 mov.u32 %r41, %r342;
+ @!%p1 mov.u32 %r42, %r342;
+ @!%p1 mov.u32 %r43, %r342;
+ mov.b32 %f69, %r40;
+ mov.b32 %f70, %r41;
+ mov.b32 %f71, %r42;
+ mov.b32 %f72, %r43;
+ mov.u32 %r48, 0x0;
+ mov.u32 %r49, 0x0;
+ mov.u32 %r50, 0x0;
+ mov.u32 %r51, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd69 + 0 ];
+ @!%p1 mov.u32 %r48, %r342;
+ @!%p1 mov.u32 %r49, %r342;
+ @!%p1 mov.u32 %r50, %r342;
+ @!%p1 mov.u32 %r51, %r342;
+ mov.b32 %f73, %r48;
+ mov.b32 %f74, %r49;
+ mov.b32 %f75, %r50;
+ mov.b32 %f76, %r51;
+ mov.u32 %r56, 0x0;
+ mov.u32 %r57, 0x0;
+ mov.u32 %r58, 0x0;
+ mov.u32 %r59, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd70 + 0 ];
+ @!%p1 mov.u32 %r56, %r342;
+ @!%p1 mov.u32 %r57, %r342;
+ @!%p1 mov.u32 %r58, %r342;
+ @!%p1 mov.u32 %r59, %r342;
+ mov.b32 %f77, %r56;
+ mov.b32 %f78, %r57;
+ mov.b32 %f79, %r58;
+ mov.b32 %f80, %r59;
+ mov.b32 %r405, 883;
+ mov.u64 %rd104, 1;
+ .loc 1 39 55
+ @%p41 bra $L__BB0_3;
+ mov.u64 %rd71, assertMessage_0;
+ cvta.global.u64 %rd72, %rd71;
+ mov.u64 %rd73, assertFile_0;
+ cvta.global.u64 %rd74, %rd73;
+ mov.u64 %rd75, assertFunc_0;
+ cvta.global.u64 %rd76, %rd75;
+ { // callseq 6, 0
+ .reg .b32 temp_param_reg;
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd72;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd74;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r405;
+ .param .b64 param3;
+ st.param.b64 [param3+0], %rd76;
+ .param .b64 param4;
+ st.param.b64 [param4+0], %rd104;
+ call.uni
+ __assertfail,
+ (
+ param0,
+ param1,
+ param2,
+ param3,
+ param4
+ );
+ } // callseq 6
+ bra.uni $L__BB0_3;
+ $L__BB0_4:
+ .loc 1 31 36
+ and.b32 %r254, %r4, 3;
+ mad.lo.s32 %r255, %r254, 72, %r2;
+ shl.b32 %r256, %r255, 2;
+ mov.u32 %r257, global_smem;
+ add.s32 %r258, %r257, %r256;
+ st.shared.f32 [%r258], %f566;
+ st.shared.f32 [%r258+1152], %f567;
+ st.shared.f32 [%r258+2304], %f568;
+ st.shared.f32 [%r258+3456], %f569;
+ st.shared.f32 [%r258+4608], %f570;
+ st.shared.f32 [%r258+5760], %f571;
+ st.shared.f32 [%r258+6912], %f572;
+ st.shared.f32 [%r258+8064], %f573;
+ bar.sync 0;
+ mad.lo.s32 %r259, %r1, 72, %r3;
+ shl.b32 %r260, %r259, 2;
+ add.s32 %r261, %r257, %r260;
+ ld.shared.v4.f32 {%f254, %f255, %f256, %f257}, [%r261];
+ ld.shared.v4.f32 {%f258, %f259, %f260, %f261}, [%r261+16];
+ bar.sync 0;
+ st.shared.f32 [%r258], %f574;
+ st.shared.f32 [%r258+1152], %f575;
+ st.shared.f32 [%r258+2304], %f576;
+ st.shared.f32 [%r258+3456], %f577;
+ st.shared.f32 [%r258+4608], %f578;
+ st.shared.f32 [%r258+5760], %f579;
+ st.shared.f32 [%r258+6912], %f580;
+ st.shared.f32 [%r258+8064], %f581;
+ bar.sync 0;
+ ld.shared.v4.f32 {%f262, %f263, %f264, %f265}, [%r261];
+ ld.shared.v4.f32 {%f266, %f267, %f268, %f269}, [%r261+16];
+ $L__tmp3:
+ .loc 2 108 21
+ sub.f32 %f270, %f599, %f598;
+ .loc 2 109 28
+ add.f32 %f271, %f254, %f255;
+ .loc 2 110 39
+ setp.eq.f32 %p63, %f271, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r146, %f255;
+ mov.b32 %r147, %f271;
+ div.full.f32 %r145, %r146, %r147;
+ mov.b32 %f272, %r145;
+ .loc 2 110 49
+ selp.f32 %f273, 0f00000000, %f272, %p63;
+ .loc 2 112 17
+ fma.rn.f32 %f274, %f270, %f273, %f598;
+ .loc 2 113 15
+ add.f32 %f275, %f582, %f583;
+ .loc 2 113 30
+ mul.f32 %f276, %f270, %f270;
+ .loc 2 113 38
+ mul.f32 %f277, %f276, %f254;
+ .loc 2 113 22
+ fma.rn.f32 %f278, %f277, %f273, %f275;
+ .loc 2 108 21
+ sub.f32 %f279, %f600, %f274;
+ .loc 2 109 28
+ add.f32 %f280, %f256, %f271;
+ .loc 2 110 39
+ setp.eq.f32 %p64, %f280, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r150, %f280;
+ mov.b32 %r149, %f256;
+ div.full.f32 %r148, %r149, %r150;
+ mov.b32 %f281, %r148;
+ .loc 2 110 49
+ selp.f32 %f282, 0f00000000, %f281, %p64;
+ .loc 2 112 17
+ fma.rn.f32 %f283, %f282, %f279, %f274;
+ .loc 2 113 15
+ add.f32 %f284, %f584, %f278;
+ .loc 2 113 30
+ mul.f32 %f285, %f279, %f279;
+ .loc 2 113 38
+ mul.f32 %f286, %f271, %f285;
+ .loc 2 113 22
+ fma.rn.f32 %f287, %f282, %f286, %f284;
+ .loc 2 108 21
+ sub.f32 %f288, %f601, %f283;
+ .loc 2 109 28
+ add.f32 %f289, %f257, %f280;
+ .loc 2 110 39
+ setp.eq.f32 %p65, %f289, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r153, %f289;
+ mov.b32 %r152, %f257;
+ div.full.f32 %r151, %r152, %r153;
+ mov.b32 %f290, %r151;
+ .loc 2 110 49
+ selp.f32 %f291, 0f00000000, %f290, %p65;
+ .loc 2 112 17
+ fma.rn.f32 %f292, %f291, %f288, %f283;
+ .loc 2 113 15
+ add.f32 %f293, %f585, %f287;
+ .loc 2 113 30
+ mul.f32 %f294, %f288, %f288;
+ .loc 2 113 38
+ mul.f32 %f295, %f280, %f294;
+ .loc 2 113 22
+ fma.rn.f32 %f296, %f291, %f295, %f293;
+ .loc 2 108 21
+ sub.f32 %f297, %f602, %f292;
+ .loc 2 109 28
+ add.f32 %f298, %f258, %f289;
+ .loc 2 110 39
+ setp.eq.f32 %p66, %f298, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r156, %f298;
+ mov.b32 %r155, %f258;
+ div.full.f32 %r154, %r155, %r156;
+ mov.b32 %f299, %r154;
+ .loc 2 110 49
+ selp.f32 %f300, 0f00000000, %f299, %p66;
+ .loc 2 112 17
+ fma.rn.f32 %f301, %f300, %f297, %f292;
+ .loc 2 113 15
+ add.f32 %f302, %f586, %f296;
+ .loc 2 113 30
+ mul.f32 %f303, %f297, %f297;
+ .loc 2 113 38
+ mul.f32 %f304, %f289, %f303;
+ .loc 2 113 22
+ fma.rn.f32 %f305, %f300, %f304, %f302;
+ .loc 2 108 21
+ sub.f32 %f306, %f603, %f301;
+ .loc 2 109 28
+ add.f32 %f307, %f259, %f298;
+ .loc 2 110 39
+ setp.eq.f32 %p67, %f307, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r159, %f307;
+ mov.b32 %r158, %f259;
+ div.full.f32 %r157, %r158, %r159;
+ mov.b32 %f308, %r157;
+ .loc 2 110 49
+ selp.f32 %f309, 0f00000000, %f308, %p67;
+ .loc 2 112 17
+ fma.rn.f32 %f310, %f309, %f306, %f301;
+ .loc 2 113 15
+ add.f32 %f311, %f587, %f305;
+ .loc 2 113 30
+ mul.f32 %f312, %f306, %f306;
+ .loc 2 113 38
+ mul.f32 %f313, %f298, %f312;
+ .loc 2 113 22
+ fma.rn.f32 %f314, %f309, %f313, %f311;
+ .loc 2 108 21
+ sub.f32 %f315, %f604, %f310;
+ .loc 2 109 28
+ add.f32 %f316, %f260, %f307;
+ .loc 2 110 39
+ setp.eq.f32 %p68, %f316, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r162, %f316;
+ mov.b32 %r161, %f260;
+ div.full.f32 %r160, %r161, %r162;
+ mov.b32 %f317, %r160;
+ .loc 2 110 49
+ selp.f32 %f318, 0f00000000, %f317, %p68;
+ .loc 2 112 17
+ fma.rn.f32 %f319, %f318, %f315, %f310;
+ .loc 2 113 15
+ add.f32 %f320, %f588, %f314;
+ .loc 2 113 30
+ mul.f32 %f321, %f315, %f315;
+ .loc 2 113 38
+ mul.f32 %f322, %f307, %f321;
+ .loc 2 113 22
+ fma.rn.f32 %f323, %f318, %f322, %f320;
+ .loc 2 108 21
+ sub.f32 %f324, %f605, %f319;
+ .loc 2 109 28
+ add.f32 %f325, %f261, %f316;
+ .loc 2 110 39
+ setp.eq.f32 %p69, %f325, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r165, %f325;
+ mov.b32 %r164, %f261;
+ div.full.f32 %r163, %r164, %r165;
+ mov.b32 %f326, %r163;
+ .loc 2 110 49
+ selp.f32 %f327, 0f00000000, %f326, %p69;
+ .loc 2 112 17
+ fma.rn.f32 %f328, %f327, %f324, %f319;
+ .loc 2 113 15
+ add.f32 %f329, %f589, %f323;
+ .loc 2 113 30
+ mul.f32 %f330, %f324, %f324;
+ .loc 2 113 38
+ mul.f32 %f331, %f316, %f330;
+ .loc 2 113 22
+ fma.rn.f32 %f332, %f327, %f331, %f329;
+ .loc 2 108 21
+ sub.f32 %f333, %f607, %f606;
+ .loc 2 109 28
+ add.f32 %f334, %f262, %f263;
+ .loc 2 110 39
+ setp.eq.f32 %p70, %f334, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r167, %f263;
+ mov.b32 %r168, %f334;
+ div.full.f32 %r166, %r167, %r168;
+ mov.b32 %f335, %r166;
+ .loc 2 110 49
+ selp.f32 %f336, 0f00000000, %f335, %p70;
+ .loc 2 112 17
+ fma.rn.f32 %f337, %f333, %f336, %f606;
+ .loc 2 113 15
+ add.f32 %f338, %f590, %f591;
+ .loc 2 113 30
+ mul.f32 %f339, %f333, %f333;
+ .loc 2 113 38
+ mul.f32 %f340, %f339, %f262;
+ .loc 2 113 22
+ fma.rn.f32 %f341, %f340, %f336, %f338;
+ .loc 2 108 21
+ sub.f32 %f342, %f608, %f337;
+ .loc 2 109 28
+ add.f32 %f343, %f264, %f334;
+ .loc 2 110 39
+ setp.eq.f32 %p71, %f343, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r171, %f343;
+ mov.b32 %r170, %f264;
+ div.full.f32 %r169, %r170, %r171;
+ mov.b32 %f344, %r169;
+ .loc 2 110 49
+ selp.f32 %f345, 0f00000000, %f344, %p71;
+ .loc 2 112 17
+ fma.rn.f32 %f346, %f345, %f342, %f337;
+ .loc 2 113 15
+ add.f32 %f347, %f592, %f341;
+ .loc 2 113 30
+ mul.f32 %f348, %f342, %f342;
+ .loc 2 113 38
+ mul.f32 %f349, %f334, %f348;
+ .loc 2 113 22
+ fma.rn.f32 %f350, %f345, %f349, %f347;
+ .loc 2 108 21
+ sub.f32 %f351, %f609, %f346;
+ .loc 2 109 28
+ add.f32 %f352, %f265, %f343;
+ .loc 2 110 39
+ setp.eq.f32 %p72, %f352, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r174, %f352;
+ mov.b32 %r173, %f265;
+ div.full.f32 %r172, %r173, %r174;
+ mov.b32 %f353, %r172;
+ .loc 2 110 49
+ selp.f32 %f354, 0f00000000, %f353, %p72;
+ .loc 2 112 17
+ fma.rn.f32 %f355, %f354, %f351, %f346;
+ .loc 2 113 15
+ add.f32 %f356, %f593, %f350;
+ .loc 2 113 30
+ mul.f32 %f357, %f351, %f351;
+ .loc 2 113 38
+ mul.f32 %f358, %f343, %f357;
+ .loc 2 113 22
+ fma.rn.f32 %f359, %f354, %f358, %f356;
+ .loc 2 108 21
+ sub.f32 %f360, %f610, %f355;
+ .loc 2 109 28
+ add.f32 %f361, %f266, %f352;
+ .loc 2 110 39
+ setp.eq.f32 %p73, %f361, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r177, %f361;
+ mov.b32 %r176, %f266;
+ div.full.f32 %r175, %r176, %r177;
+ mov.b32 %f362, %r175;
+ .loc 2 110 49
+ selp.f32 %f363, 0f00000000, %f362, %p73;
+ .loc 2 112 17
+ fma.rn.f32 %f364, %f363, %f360, %f355;
+ .loc 2 113 15
+ add.f32 %f365, %f594, %f359;
+ .loc 2 113 30
+ mul.f32 %f366, %f360, %f360;
+ .loc 2 113 38
+ mul.f32 %f367, %f352, %f366;
+ .loc 2 113 22
+ fma.rn.f32 %f368, %f363, %f367, %f365;
+ .loc 2 108 21
+ sub.f32 %f369, %f611, %f364;
+ .loc 2 109 28
+ add.f32 %f370, %f267, %f361;
+ .loc 2 110 39
+ setp.eq.f32 %p74, %f370, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r180, %f370;
+ mov.b32 %r179, %f267;
+ div.full.f32 %r178, %r179, %r180;
+ mov.b32 %f371, %r178;
+ .loc 2 110 49
+ selp.f32 %f372, 0f00000000, %f371, %p74;
+ .loc 2 112 17
+ fma.rn.f32 %f373, %f372, %f369, %f364;
+ .loc 2 113 15
+ add.f32 %f374, %f595, %f368;
+ .loc 2 113 30
+ mul.f32 %f375, %f369, %f369;
+ .loc 2 113 38
+ mul.f32 %f376, %f361, %f375;
+ .loc 2 113 22
+ fma.rn.f32 %f377, %f372, %f376, %f374;
+ .loc 2 108 21
+ sub.f32 %f378, %f612, %f373;
+ .loc 2 109 28
+ add.f32 %f379, %f268, %f370;
+ .loc 2 110 39
+ setp.eq.f32 %p75, %f379, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r183, %f379;
+ mov.b32 %r182, %f268;
+ div.full.f32 %r181, %r182, %r183;
+ mov.b32 %f380, %r181;
+ .loc 2 110 49
+ selp.f32 %f381, 0f00000000, %f380, %p75;
+ .loc 2 112 17
+ fma.rn.f32 %f382, %f381, %f378, %f373;
+ .loc 2 113 15
+ add.f32 %f383, %f596, %f377;
+ .loc 2 113 30
+ mul.f32 %f384, %f378, %f378;
+ .loc 2 113 38
+ mul.f32 %f385, %f370, %f384;
+ .loc 2 113 22
+ fma.rn.f32 %f386, %f381, %f385, %f383;
+ .loc 2 108 21
+ sub.f32 %f387, %f613, %f382;
+ .loc 2 109 28
+ add.f32 %f388, %f269, %f379;
+ .loc 2 110 39
+ setp.eq.f32 %p76, %f388, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r186, %f388;
+ mov.b32 %r185, %f269;
+ div.full.f32 %r184, %r185, %r186;
+ mov.b32 %f389, %r184;
+ .loc 2 110 49
+ selp.f32 %f390, 0f00000000, %f389, %p76;
+ .loc 2 112 17
+ fma.rn.f32 %f391, %f390, %f387, %f382;
+ .loc 2 113 15
+ add.f32 %f392, %f597, %f386;
+ .loc 2 113 30
+ mul.f32 %f393, %f387, %f387;
+ .loc 2 113 38
+ mul.f32 %f394, %f379, %f393;
+ .loc 2 113 22
+ fma.rn.f32 %f395, %f390, %f394, %f392;
+ $L__tmp4:
+ .loc 2 120 46
+ mov.b32 %r262, %f328;
+ shfl.sync.bfly.b32 %r263, %r262, 4, 31, -1;
+ mov.b32 %f396, %r263;
+ mov.b32 %r264, %f332;
+ shfl.sync.bfly.b32 %r265, %r264, 4, 31, -1;
+ mov.b32 %f397, %r265;
+ shfl.sync.bfly.b32 %r188, %r165, 4, 31, -1;
+ mov.b32 %f398, %r188;
+ $L__tmp5:
+ .loc 2 108 21
+ sub.f32 %f399, %f396, %f328;
+ .loc 2 109 28
+ add.f32 %f400, %f325, %f398;
+ .loc 2 110 39
+ setp.eq.f32 %p77, %f400, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r189, %f400;
+ div.full.f32 %r187, %r188, %r189;
+ mov.b32 %f401, %r187;
+ .loc 2 110 49
+ selp.f32 %f402, 0f00000000, %f401, %p77;
+ .loc 2 112 17
+ fma.rn.f32 %f403, %f402, %f399, %f328;
+ .loc 2 113 15
+ add.f32 %f404, %f332, %f397;
+ .loc 2 113 30
+ mul.f32 %f405, %f399, %f399;
+ .loc 2 113 38
+ mul.f32 %f406, %f325, %f405;
+ .loc 2 113 22
+ fma.rn.f32 %f407, %f402, %f406, %f404;
+ $L__tmp6:
+ .loc 2 120 46
+ mov.b32 %r266, %f403;
+ shfl.sync.bfly.b32 %r267, %r266, 2, 31, -1;
+ mov.b32 %f408, %r267;
+ mov.b32 %r268, %f407;
+ shfl.sync.bfly.b32 %r269, %r268, 2, 31, -1;
+ mov.b32 %f409, %r269;
+ shfl.sync.bfly.b32 %r191, %r189, 2, 31, -1;
+ mov.b32 %f410, %r191;
+ $L__tmp7:
+ .loc 2 108 21
+ sub.f32 %f411, %f408, %f403;
+ .loc 2 109 28
+ add.f32 %f412, %f400, %f410;
+ .loc 2 110 39
+ setp.eq.f32 %p78, %f412, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r192, %f412;
+ div.full.f32 %r190, %r191, %r192;
+ mov.b32 %f413, %r190;
+ .loc 2 110 49
+ selp.f32 %f414, 0f00000000, %f413, %p78;
+ .loc 2 112 17
+ fma.rn.f32 %f415, %f414, %f411, %f403;
+ .loc 2 113 15
+ add.f32 %f416, %f407, %f409;
+ .loc 2 113 30
+ mul.f32 %f417, %f411, %f411;
+ .loc 2 113 38
+ mul.f32 %f418, %f400, %f417;
+ .loc 2 113 22
+ fma.rn.f32 %f419, %f414, %f418, %f416;
+ $L__tmp8:
+ .loc 2 120 46
+ mov.b32 %r270, %f415;
+ shfl.sync.bfly.b32 %r271, %r270, 1, 31, -1;
+ mov.b32 %f420, %r271;
+ mov.b32 %r272, %f419;
+ shfl.sync.bfly.b32 %r273, %r272, 1, 31, -1;
+ mov.b32 %f421, %r273;
+ shfl.sync.bfly.b32 %r194, %r192, 1, 31, -1;
+ mov.b32 %f422, %r194;
+ $L__tmp9:
+ .loc 2 108 21
+ sub.f32 %f423, %f420, %f415;
+ .loc 2 109 28
+ add.f32 %f424, %f412, %f422;
+ .loc 2 110 39
+ setp.eq.f32 %p79, %f424, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r195, %f424;
+ div.full.f32 %r193, %r194, %r195;
+ mov.b32 %f425, %r193;
+ .loc 2 110 49
+ selp.f32 %f426, 0f00000000, %f425, %p79;
+ .loc 2 112 17
+ fma.rn.f32 %f145, %f423, %f426, %f415;
+ .loc 2 113 15
+ add.f32 %f427, %f419, %f421;
+ .loc 2 113 30
+ mul.f32 %f428, %f423, %f423;
+ .loc 2 113 38
+ mul.f32 %f429, %f412, %f428;
+ .loc 2 113 22
+ fma.rn.f32 %f430, %f426, %f429, %f427;
+ $L__tmp10:
+ .loc 2 120 46
+ mov.b32 %r274, %f391;
+ shfl.sync.bfly.b32 %r275, %r274, 4, 31, -1;
+ mov.b32 %f431, %r275;
+ mov.b32 %r276, %f395;
+ shfl.sync.bfly.b32 %r277, %r276, 4, 31, -1;
+ mov.b32 %f432, %r277;
+ shfl.sync.bfly.b32 %r197, %r186, 4, 31, -1;
+ mov.b32 %f433, %r197;
+ $L__tmp11:
+ .loc 2 108 21
+ sub.f32 %f434, %f431, %f391;
+ .loc 2 109 28
+ add.f32 %f435, %f388, %f433;
+ .loc 2 110 39
+ setp.eq.f32 %p80, %f435, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r198, %f435;
+ div.full.f32 %r196, %r197, %r198;
+ mov.b32 %f436, %r196;
+ .loc 2 110 49
+ selp.f32 %f437, 0f00000000, %f436, %p80;
+ .loc 2 112 17
+ fma.rn.f32 %f438, %f434, %f437, %f391;
+ .loc 2 113 15
+ add.f32 %f439, %f395, %f432;
+ .loc 2 113 30
+ mul.f32 %f440, %f434, %f434;
+ .loc 2 113 38
+ mul.f32 %f441, %f388, %f440;
+ .loc 2 113 22
+ fma.rn.f32 %f442, %f441, %f437, %f439;
+ $L__tmp12:
+ .loc 2 120 46
+ mov.b32 %r278, %f438;
+ shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1;
+ mov.b32 %f443, %r279;
+ mov.b32 %r280, %f442;
+ shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1;
+ mov.b32 %f444, %r281;
+ shfl.sync.bfly.b32 %r200, %r198, 2, 31, -1;
+ mov.b32 %f445, %r200;
+ $L__tmp13:
+ .loc 2 108 21
+ sub.f32 %f446, %f443, %f438;
+ .loc 2 109 28
+ add.f32 %f447, %f435, %f445;
+ .loc 2 110 39
+ setp.eq.f32 %p81, %f447, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r201, %f447;
+ div.full.f32 %r199, %r200, %r201;
+ mov.b32 %f448, %r199;
+ .loc 2 110 49
+ selp.f32 %f449, 0f00000000, %f448, %p81;
+ .loc 2 112 17
+ fma.rn.f32 %f450, %f446, %f449, %f438;
+ .loc 2 113 15
+ add.f32 %f451, %f442, %f444;
+ .loc 2 113 30
+ mul.f32 %f452, %f446, %f446;
+ .loc 2 113 38
+ mul.f32 %f453, %f435, %f452;
+ .loc 2 113 22
+ fma.rn.f32 %f454, %f449, %f453, %f451;
+ $L__tmp14:
+ .loc 2 120 46
+ mov.b32 %r282, %f450;
+ shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1;
+ mov.b32 %f455, %r283;
+ mov.b32 %r284, %f454;
+ shfl.sync.bfly.b32 %r285, %r284, 1, 31, -1;
+ mov.b32 %f456, %r285;
+ shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1;
+ mov.b32 %f457, %r203;
+ $L__tmp15:
+ .loc 2 108 21
+ sub.f32 %f458, %f455, %f450;
+ .loc 2 109 28
+ add.f32 %f459, %f447, %f457;
+ .loc 2 110 39
+ setp.eq.f32 %p82, %f459, 0f00000000;
+ .loc 2 110 60
+ mov.b32 %r204, %f459;
+ div.full.f32 %r202, %r203, %r204;
+ mov.b32 %f460, %r202;
+ .loc 2 110 49
+ selp.f32 %f461, 0f00000000, %f460, %p82;
+ .loc 2 112 17
+ fma.rn.f32 %f146, %f458, %f461, %f450;
+ .loc 2 113 15
+ add.f32 %f462, %f454, %f456;
+ .loc 2 113 30
+ mul.f32 %f463, %f458, %f458;
+ .loc 2 113 38
+ mul.f32 %f464, %f447, %f463;
+ .loc 2 113 22
+ fma.rn.f32 %f465, %f461, %f464, %f462;
+ $L__tmp16:
+ .loc 1 69 23
+ mov.b32 %r206, %f430;
+ mov.b32 %r207, 1132462080;
+ div.full.f32 %r205, %r206, %r207;
+ mov.b32 %f466, %r205;
+ mov.b32 %r230, %f465;
+ div.full.f32 %r229, %r230, %r207;
+ mov.b32 %f467, %r229;
+ .loc 1 71 24
+ add.f32 %f147, %f466, 0f3727C5AC;
+ add.f32 %f148, %f467, 0f3727C5AC;
+ .loc 1 55 36
+ add.s64 %rd9, %rd12, %rd2;
+ shl.b32 %r286, %r11, 14;
+ shl.b32 %r287, %r1, 8;
+ or.b32 %r288, %r286, %r287;
+ or.b32 %r8, %r288, %r3;
+ mov.u64 %rd106, 0;
+ mov.b32 %r407, -64;
+ rsqrt.approx.ftz.f32 %f516, %f147;
+ rsqrt.approx.ftz.f32 %f517, %f148;
+ bra.uni $L__BB0_5;
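+ // Second pass: $L__BB0_5 reloads the same inputs (re-checking the gather
+ // index against 50257) plus the param_3 weights, and $L__BB0_7 recomputes
+ // x + y, subtracts the row means (%f145/%f146), multiplies by the
+ // precomputed rsqrt factors (%f516/%f517) and the weights, then converts
+ // to bf16 and stores two v4.b32 vectors per iteration.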
+ $L__BB0_7:
+ .loc 1 65 35
+ add.s64 %rd96, %rd4, %rd106;
+ add.s64 %rd97, %rd96, 16;
+ add.s64 %rd98, %rd3, %rd106;
+ .loc 1 65 54
+ add.s64 %rd99, %rd98, 16;
+ mov.u32 %r338, 0x0;
+ mov.u32 %r339, 0x0;
+ mov.u32 %r340, 0x0;
+ mov.u32 %r341, 0x0;
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r338, %r339, %r340, %r341 }, [ %rd96 + 0 ];
+ @!%p1 mov.u32 %r338, %r342;
+ @!%p1 mov.u32 %r339, %r342;
+ @!%p1 mov.u32 %r340, %r342;
+ @!%p1 mov.u32 %r341, %r342;
+ mov.b32 %f468, %r338;
+ mov.b32 %f469, %r339;
+ mov.b32 %f470, %r340;
+ mov.b32 %f471, %r341;
+ mov.u32 %r346, 0x0;
+ mov.u32 %r347, 0x0;
+ mov.u32 %r348, 0x0;
+ mov.u32 %r349, 0x0;
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r346, %r347, %r348, %r349 }, [ %rd97 + 0 ];
+ @!%p1 mov.u32 %r346, %r342;
+ @!%p1 mov.u32 %r347, %r342;
+ @!%p1 mov.u32 %r348, %r342;
+ @!%p1 mov.u32 %r349, %r342;
+ mov.b32 %f472, %r346;
+ mov.b32 %f473, %r347;
+ mov.b32 %f474, %r348;
+ mov.b32 %f475, %r349;
+ mov.u32 %r354, 0x0;
+ mov.u32 %r355, 0x0;
+ mov.u32 %r356, 0x0;
+ mov.u32 %r357, 0x0;
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r354, %r355, %r356, %r357 }, [ %rd98 + 0 ];
+ @!%p1 mov.u32 %r354, %r342;
+ @!%p1 mov.u32 %r355, %r342;
+ @!%p1 mov.u32 %r356, %r342;
+ @!%p1 mov.u32 %r357, %r342;
+ mov.b32 %f476, %r354;
+ mov.b32 %f477, %r355;
+ mov.b32 %f478, %r356;
+ mov.b32 %f479, %r357;
+ mov.u32 %r362, 0x0;
+ mov.u32 %r363, 0x0;
+ mov.u32 %r364, 0x0;
+ mov.u32 %r365, 0x0;
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r362, %r363, %r364, %r365 }, [ %rd99 + 0 ];
+ @!%p1 mov.u32 %r362, %r342;
+ @!%p1 mov.u32 %r363, %r342;
+ @!%p1 mov.u32 %r364, %r342;
+ @!%p1 mov.u32 %r365, %r342;
+ mov.b32 %f480, %r362;
+ mov.b32 %f481, %r363;
+ mov.b32 %f482, %r364;
+ mov.b32 %f483, %r365;
+ .loc 1 66 24
+ add.f32 %f484, %f149, %f468;
+ add.f32 %f485, %f150, %f469;
+ add.f32 %f486, %f151, %f470;
+ add.f32 %f487, %f152, %f471;
+ add.f32 %f488, %f153, %f472;
+ add.f32 %f489, %f154, %f473;
+ add.f32 %f490, %f155, %f474;
+ add.f32 %f491, %f156, %f475;
+ add.f32 %f492, %f157, %f476;
+ add.f32 %f493, %f158, %f477;
+ add.f32 %f494, %f159, %f478;
+ add.f32 %f495, %f160, %f479;
+ add.f32 %f496, %f161, %f480;
+ add.f32 %f497, %f162, %f481;
+ add.f32 %f498, %f163, %f482;
+ add.f32 %f499, %f164, %f483;
+ .loc 1 67 24
+ sub.f32 %f500, %f484, %f145;
+ sub.f32 %f501, %f485, %f145;
+ sub.f32 %f502, %f486, %f145;
+ sub.f32 %f503, %f487, %f145;
+ sub.f32 %f504, %f488, %f145;
+ sub.f32 %f505, %f489, %f145;
+ sub.f32 %f506, %f490, %f145;
+ sub.f32 %f507, %f491, %f145;
+ sub.f32 %f508, %f492, %f146;
+ sub.f32 %f509, %f493, %f146;
+ sub.f32 %f510, %f494, %f146;
+ sub.f32 %f511, %f495, %f146;
+ sub.f32 %f512, %f496, %f146;
+ sub.f32 %f513, %f497, %f146;
+ sub.f32 %f514, %f498, %f146;
+ sub.f32 %f515, %f499, %f146;
+ .loc 1 73 24
+ mul.f32 %f518, %f500, %f516;
+ mul.f32 %f519, %f501, %f516;
+ mul.f32 %f520, %f502, %f516;
+ mul.f32 %f521, %f503, %f516;
+ mul.f32 %f522, %f504, %f516;
+ mul.f32 %f523, %f505, %f516;
+ mul.f32 %f524, %f506, %f516;
+ mul.f32 %f525, %f507, %f516;
+ mul.f32 %f526, %f508, %f517;
+ mul.f32 %f527, %f509, %f517;
+ mul.f32 %f528, %f510, %f517;
+ mul.f32 %f529, %f511, %f517;
+ mul.f32 %f530, %f512, %f517;
+ mul.f32 %f531, %f513, %f517;
+ mul.f32 %f532, %f514, %f517;
+ mul.f32 %f533, %f515, %f517;
+ .loc 1 74 24
+ mul.f32 %f534, %f518, %f165;
+ mul.f32 %f535, %f519, %f166;
+ mul.f32 %f536, %f520, %f167;
+ mul.f32 %f537, %f521, %f168;
+ mul.f32 %f538, %f522, %f169;
+ mul.f32 %f539, %f523, %f170;
+ mul.f32 %f540, %f524, %f171;
+ mul.f32 %f541, %f525, %f172;
+ mul.f32 %f542, %f526, %f165;
+ mul.f32 %f543, %f527, %f166;
+ mul.f32 %f544, %f528, %f167;
+ mul.f32 %f545, %f529, %f168;
+ mul.f32 %f546, %f530, %f169;
+ mul.f32 %f547, %f531, %f170;
+ mul.f32 %f548, %f532, %f171;
+ mul.f32 %f549, %f533, %f172;
+ .loc 1 76 35
+ add.s32 %r394, %r8, %r407;
+ add.s32 %r395, %r394, 64;
+ .loc 1 76 29
+ add.s32 %r396, %r394, 8256;
+ mul.wide.s32 %rd102, %r395, 2;
+ add.s64 %rd100, %rd13, %rd102;
+ mul.wide.s32 %rd103, %r396, 2;
+ add.s64 %rd101, %rd13, %rd103;
+ .loc 1 76 52
+ mov.b32 %r370, %f534;
+ cvt.rn.bf16.f32 %rs1, %r370;
+ mov.b32 %r371, %f535;
+ cvt.rn.bf16.f32 %rs2, %r371;
+ mov.b32 %r372, %f536;
+ cvt.rn.bf16.f32 %rs3, %r372;
+ mov.b32 %r373, %f537;
+ cvt.rn.bf16.f32 %rs4, %r373;
+ mov.b32 %r374, %f538;
+ cvt.rn.bf16.f32 %rs5, %r374;
+ mov.b32 %r375, %f539;
+ cvt.rn.bf16.f32 %rs6, %r375;
+ mov.b32 %r376, %f540;
+ cvt.rn.bf16.f32 %rs7, %r376;
+ mov.b32 %r377, %f541;
+ cvt.rn.bf16.f32 %rs8, %r377;
+ mov.b32 %r378, %f542;
+ cvt.rn.bf16.f32 %rs9, %r378;
+ mov.b32 %r379, %f543;
+ cvt.rn.bf16.f32 %rs10, %r379;
+ mov.b32 %r380, %f544;
+ cvt.rn.bf16.f32 %rs11, %r380;
+ mov.b32 %r381, %f545;
+ cvt.rn.bf16.f32 %rs12, %r381;
+ mov.b32 %r382, %f546;
+ cvt.rn.bf16.f32 %rs13, %r382;
+ mov.b32 %r383, %f547;
+ cvt.rn.bf16.f32 %rs14, %r383;
+ mov.b32 %r384, %f548;
+ cvt.rn.bf16.f32 %rs15, %r384;
+ mov.b32 %r385, %f549;
+ cvt.rn.bf16.f32 %rs16, %r385;
+ mov.b32 %r397, {%rs1, %rs2};
+ mov.b32 %r398, {%rs3, %rs4};
+ mov.b32 %r399, {%rs5, %rs6};
+ mov.b32 %r400, {%rs7, %rs8};
+ @%p1 st.global.v4.b32 [ %rd100 + 0 ], { %r397, %r398, %r399, %r400 };
+ mov.b32 %r401, {%rs9, %rs10};
+ mov.b32 %r402, {%rs11, %rs12};
+ mov.b32 %r403, {%rs13, %rs14};
+ mov.b32 %r404, {%rs15, %rs16};
+ @%p1 st.global.v4.b32 [ %rd101 + 0 ], { %r401, %r402, %r403, %r404 };
+ .loc 1 55 36
+ add.s64 %rd106, %rd106, 256;
+ add.s32 %r407, %r407, 64;
+ setp.lt.u32 %p136, %r407, 192;
+ @%p136 bra $L__BB0_5;
+ bra.uni $L__BB0_8;
+ $L__BB0_5:
+ .loc 1 59 35
+ add.s64 %rd83, %rd6, %rd106;
+ add.s64 %rd84, %rd83, 16;
+ add.s64 %rd85, %rd5, %rd106;
+ .loc 1 59 51
+ add.s64 %rd86, %rd85, 16;
+ mov.u32 %r289, 0x0;
+ mov.u32 %r290, 0x0;
+ mov.u32 %r291, 0x0;
+ mov.u32 %r292, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r289, %r290, %r291, %r292 }, [ %rd83 + 0 ];
+ @!%p1 mov.u32 %r289, %r342;
+ @!%p1 mov.u32 %r290, %r342;
+ @!%p1 mov.u32 %r291, %r342;
+ @!%p1 mov.u32 %r292, %r342;
+ mov.b32 %f149, %r289;
+ mov.b32 %f150, %r290;
+ mov.b32 %f151, %r291;
+ mov.b32 %f152, %r292;
+ mov.u32 %r297, 0x0;
+ mov.u32 %r298, 0x0;
+ mov.u32 %r299, 0x0;
+ mov.u32 %r300, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r297, %r298, %r299, %r300 }, [ %rd84 + 0 ];
+ @!%p1 mov.u32 %r297, %r342;
+ @!%p1 mov.u32 %r298, %r342;
+ @!%p1 mov.u32 %r299, %r342;
+ @!%p1 mov.u32 %r300, %r342;
+ mov.b32 %f153, %r297;
+ mov.b32 %f154, %r298;
+ mov.b32 %f155, %r299;
+ mov.b32 %f156, %r300;
+ mov.u32 %r305, 0x0;
+ mov.u32 %r306, 0x0;
+ mov.u32 %r307, 0x0;
+ mov.u32 %r308, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r305, %r306, %r307, %r308 }, [ %rd85 + 0 ];
+ @!%p1 mov.u32 %r305, %r342;
+ @!%p1 mov.u32 %r306, %r342;
+ @!%p1 mov.u32 %r307, %r342;
+ @!%p1 mov.u32 %r308, %r342;
+ mov.b32 %f157, %r305;
+ mov.b32 %f158, %r306;
+ mov.b32 %f159, %r307;
+ mov.b32 %f160, %r308;
+ mov.u32 %r313, 0x0;
+ mov.u32 %r314, 0x0;
+ mov.u32 %r315, 0x0;
+ mov.u32 %r316, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r313, %r314, %r315, %r316 }, [ %rd86 + 0 ];
+ @!%p1 mov.u32 %r313, %r342;
+ @!%p1 mov.u32 %r314, %r342;
+ @!%p1 mov.u32 %r315, %r342;
+ @!%p1 mov.u32 %r316, %r342;
+ mov.b32 %f161, %r313;
+ mov.b32 %f162, %r314;
+ mov.b32 %f163, %r315;
+ mov.b32 %f164, %r316;
+ .loc 1 60 35
+ add.s64 %rd87, %rd9, %rd106;
+ .loc 1 60 40
+ add.s64 %rd88, %rd87, 16;
+ mov.u32 %r321, 0x0;
+ mov.u32 %r322, 0x0;
+ mov.u32 %r323, 0x0;
+ mov.u32 %r324, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd87 + 0 ];
+ @!%p1 mov.u32 %r321, %r342;
+ @!%p1 mov.u32 %r322, %r342;
+ @!%p1 mov.u32 %r323, %r342;
+ @!%p1 mov.u32 %r324, %r342;
+ mov.b32 %f165, %r321;
+ mov.b32 %f166, %r322;
+ mov.b32 %f167, %r323;
+ mov.b32 %f168, %r324;
+ mov.u32 %r329, 0x0;
+ mov.u32 %r330, 0x0;
+ mov.u32 %r331, 0x0;
+ mov.u32 %r332, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd88 + 0 ];
+ @!%p1 mov.u32 %r329, %r342;
+ @!%p1 mov.u32 %r330, %r342;
+ @!%p1 mov.u32 %r331, %r342;
+ @!%p1 mov.u32 %r332, %r342;
+ mov.b32 %f169, %r329;
+ mov.b32 %f170, %r330;
+ mov.b32 %f171, %r331;
+ mov.b32 %f172, %r332;
+ .loc 1 64 57
+ @%p41 bra $L__BB0_7;
+ mov.u64 %rd89, assertMessage_1;
+ cvta.global.u64 %rd90, %rd89;
+ mov.u64 %rd91, assertFile_1;
+ cvta.global.u64 %rd92, %rd91;
+ mov.u64 %rd93, assertFunc_1;
+ cvta.global.u64 %rd94, %rd93;
+ { // callseq 7, 0
+ .reg .b32 temp_param_reg;
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd90;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd92;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r405;
+ .param .b64 param3;
+ st.param.b64 [param3+0], %rd94;
+ .param .b64 param4;
+ st.param.b64 [param4+0], %rd104;
+ call.uni
+ __assertfail,
+ (
+ param0,
+ param1,
+ param2,
+ param3,
+ param4
+ );
+ } // callseq 7
+ bra.uni $L__BB0_7;
+ $L__BB0_8:
+ .loc 1 55 4
+ ret;
+ $L__tmp17:
+ $L__func_end0:
+
+ }
1457
+ // .globl __nv_rsqrtf
1458
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1459
+ .param .b32 __nv_rsqrtf_param_0
1460
+ )
1461
+ {
1462
+ .reg .f32 %f<3>;
1463
+ $L__func_begin1:
1464
+
1465
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
1466
+ rsqrt.approx.ftz.f32 %f2, %f1;
1467
+ st.param.f32 [func_retval0+0], %f2;
1468
+ ret;
1469
+ $L__func_end1:
1470
+
1471
+ }
1472
+ .file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py"
1473
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1474
+ .section .debug_abbrev
1475
+ {
1476
+ .b8 1
1477
+ .b8 17
1478
+ .b8 1
1479
+ .b8 37
1480
+ .b8 8
1481
+ .b8 19
1482
+ .b8 5
1483
+ .b8 3
1484
+ .b8 8
1485
+ .b8 16
1486
+ .b8 6
1487
+ .b8 27
1488
+ .b8 8
1489
+ .b8 180
1490
+ .b8 66
1491
+ .b8 12
1492
+ .b8 17
1493
+ .b8 1
1494
+ .b8 18
1495
+ .b8 1
1496
+ .b8 0
1497
+ .b8 0
1498
+ .b8 2
1499
+ .b8 46
1500
+ .b8 0
1501
+ .b8 135
1502
+ .b8 64
1503
+ .b8 8
1504
+ .b8 3
1505
+ .b8 8
1506
+ .b8 58
1507
+ .b8 11
1508
+ .b8 59
1509
+ .b8 11
1510
+ .b8 63
1511
+ .b8 12
1512
+ .b8 32
1513
+ .b8 11
1514
+ .b8 0
1515
+ .b8 0
1516
+ .b8 3
1517
+ .b8 46
1518
+ .b8 1
1519
+ .b8 17
1520
+ .b8 1
1521
+ .b8 18
1522
+ .b8 1
1523
+ .b8 64
1524
+ .b8 10
1525
+ .b8 49
1526
+ .b8 19
1527
+ .b8 0
1528
+ .b8 0
1529
+ .b8 4
1530
+ .b8 29
1531
+ .b8 0
1532
+ .b8 49
1533
+ .b8 19
1534
+ .b8 17
1535
+ .b8 1
1536
+ .b8 18
1537
+ .b8 1
1538
+ .b8 88
1539
+ .b8 11
1540
+ .b8 89
1541
+ .b8 11
1542
+ .b8 87
1543
+ .b8 11
1544
+ .b8 0
1545
+ .b8 0
1546
+ .b8 5
1547
+ .b8 29
1548
+ .b8 1
1549
+ .b8 49
1550
+ .b8 19
1551
+ .b8 17
1552
+ .b8 1
1553
+ .b8 18
1554
+ .b8 1
1555
+ .b8 88
1556
+ .b8 11
1557
+ .b8 89
1558
+ .b8 11
1559
+ .b8 87
1560
+ .b8 11
1561
+ .b8 0
1562
+ .b8 0
1563
+ .b8 0
1564
+ }
1565
+ .section .debug_info
1566
+ {
1567
+ .b32 298
1568
+ .b8 2
1569
+ .b8 0
1570
+ .b32 .debug_abbrev
1571
+ .b8 8
1572
+ .b8 1
1573
+ .b8 116
1574
+ .b8 114
1575
+ .b8 105
1576
+ .b8 116
1577
+ .b8 111
1578
+ .b8 110
1579
+ .b8 0
1580
+ .b8 2
1581
+ .b8 0
1582
+ .b8 99
1583
+ .b8 108
1584
+ .b8 104
1585
+ .b8 101
1586
+ .b8 52
1587
+ .b8 97
1588
+ .b8 51
1589
+ .b8 115
1590
+ .b8 116
1591
+ .b8 118
1592
+ .b8 117
1593
+ .b8 102
1594
+ .b8 120
1595
+ .b8 97
1596
+ .b8 102
1597
+ .b8 109
1598
+ .b8 113
1599
+ .b8 51
1600
+ .b8 107
1601
+ .b8 107
1602
+ .b8 53
1603
+ .b8 104
1604
+ .b8 111
1605
+ .b8 100
1606
+ .b8 97
1607
+ .b8 122
1608
+ .b8 122
1609
+ .b8 50
1610
+ .b8 101
1611
+ .b8 102
1612
+ .b8 99
1613
+ .b8 116
1614
+ .b8 102
1615
+ .b8 102
1616
+ .b8 116
1617
+ .b8 101
1618
+ .b8 54
1619
+ .b8 52
1620
+ .b8 54
1621
+ .b8 122
1622
+ .b8 110
1623
+ .b8 106
1624
+ .b8 100
1625
+ .b8 110
1626
+ .b8 118
1627
+ .b8 51
1628
+ .b8 108
1629
+ .b8 113
1630
+ .b8 105
1631
+ .b8 53
1632
+ .b8 111
1633
+ .b8 97
1634
+ .b8 46
1635
+ .b8 112
1636
+ .b8 121
1637
+ .b8 0
1638
+ .b32 .debug_line
1639
+ .b8 47
1640
+ .b8 116
1641
+ .b8 109
1642
+ .b8 112
1643
+ .b8 47
1644
+ .b8 116
1645
+ .b8 111
1646
+ .b8 114
1647
+ .b8 99
1648
+ .b8 104
1649
+ .b8 105
1650
+ .b8 110
1651
+ .b8 100
1652
+ .b8 117
1653
+ .b8 99
1654
+ .b8 116
1655
+ .b8 111
1656
+ .b8 114
1657
+ .b8 95
1658
+ .b8 114
1659
+ .b8 111
1660
+ .b8 111
1661
+ .b8 116
1662
+ .b8 47
1663
+ .b8 108
1664
+ .b8 104
1665
+ .b8 0
1666
+ .b8 1
1667
+ .b64 $L__func_begin0
1668
+ .b64 $L__func_end0
1669
+ .b8 2
1670
+ .b8 116
1671
+ .b8 114
1672
+ .b8 105
1673
+ .b8 116
1674
+ .b8 111
1675
+ .b8 110
1676
+ .b8 95
1677
+ .b8 95
1678
+ .b8 48
1679
+ .b8 100
1680
+ .b8 49
1681
+ .b8 100
1682
+ .b8 50
1683
+ .b8 100
1684
+ .b8 51
1685
+ .b8 100
1686
+ .b8 52
1687
+ .b8 100
1688
+ .b8 53
1689
+ .b8 100
1690
+ .b8 101
1691
+ .b8 54
1692
+ .b8 100
1693
+ .b8 101
1694
+ .b8 0
1695
+ .b8 116
1696
+ .b8 114
1697
+ .b8 105
1698
+ .b8 116
1699
+ .b8 111
1700
+ .b8 110
1701
+ .b8 95
1702
+ .b8 95
1703
+ .b8 48
1704
+ .b8 100
1705
+ .b8 49
1706
+ .b8 100
1707
+ .b8 50
1708
+ .b8 100
1709
+ .b8 51
1710
+ .b8 100
1711
+ .b8 52
1712
+ .b8 100
1713
+ .b8 53
1714
+ .b8 100
1715
+ .b8 101
1716
+ .b8 54
1717
+ .b8 100
1718
+ .b8 101
1719
+ .b8 0
1720
+ .b8 1
1721
+ .b8 18
1722
+ .b8 1
1723
+ .b8 1
1724
+ .b8 3
1725
+ .b64 $L__func_begin0
1726
+ .b64 $L__func_end0
1727
+ .b8 1
1728
+ .b8 156
1729
+ .b32 125
1730
+ .b8 4
1731
+ .b32 125
1732
+ .b64 $L__tmp1
1733
+ .b64 $L__tmp2
1734
+ .b8 2
1735
+ .b8 44
1736
+ .b8 38
1737
+ .b8 5
1738
+ .b32 125
1739
+ .b64 $L__tmp3
1740
+ .b64 $L__tmp16
1741
+ .b8 2
1742
+ .b8 50
1743
+ .b8 41
1744
+ .b8 4
1745
+ .b32 125
1746
+ .b64 $L__tmp3
1747
+ .b64 $L__tmp16
1748
+ .b8 2
1749
+ .b8 120
1750
+ .b8 46
1751
+ .b8 0
1752
+ .b8 4
1753
+ .b32 125
1754
+ .b64 $L__tmp4
1755
+ .b64 $L__tmp15
1756
+ .b8 2
1757
+ .b8 50
1758
+ .b8 41
1759
+ .b8 0
1760
+ .b8 0
1761
+ }
1762
+ .section .debug_pubnames
1763
+ {
1764
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1765
+ $L__pubNames_start0:
1766
+ .b8 2
1767
+ .b8 0
1768
+ .b32 .debug_info
1769
+ .b32 302
1770
+ .b32 125
1771
+ .b8 116
1772
+ .b8 114
1773
+ .b8 105
1774
+ .b8 116
1775
+ .b8 111
1776
+ .b8 110
1777
+ .b8 95
1778
+ .b8 95
1779
+ .b8 48
1780
+ .b8 100
1781
+ .b8 49
1782
+ .b8 100
1783
+ .b8 50
1784
+ .b8 100
1785
+ .b8 51
1786
+ .b8 100
1787
+ .b8 52
1788
+ .b8 100
1789
+ .b8 53
1790
+ .b8 100
1791
+ .b8 101
1792
+ .b8 54
1793
+ .b8 100
1794
+ .b8 101
1795
+ .b8 0
1796
+ .b32 0
1797
+ $L__pubNames_end0:
1798
+ }
1799
+ .section .debug_pubtypes
1800
+ {
1801
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1802
+ $L__pubTypes_start0:
1803
+ .b8 2
1804
+ .b8 0
1805
+ .b32 .debug_info
1806
+ .b32 302
1807
+ .b32 0
1808
+ $L__pubTypes_end0:
1809
+ }
1810
+ .section .debug_loc { }
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir ADDED
@@ -0,0 +1,153 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
+ %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
+ %c0_i32 = arith.constant 0 : i32
+ %c64_i32 = arith.constant 64 : i32
+ %c256_i32 = arith.constant 256 : i32
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
+ %cst_11 = arith.constant 0.000000e+00 : f32
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
+ %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
+ %0 = tt.get_program_id x : i32
+ %1 = arith.muli %0, %c64_i32 : i32
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
+ %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+ %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+ %24 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
+ %25 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
+ %26 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
+ %27 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
+ %28 = arith.select %26, %24, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
+ %29 = arith.select %27, %25, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
+ %30 = arith.cmpi sge, %29, %cst_9 : tensor<64x1xi64, #blocked1>
+ %31 = arith.cmpi slt, %29, %cst_8 : tensor<64x1xi64, #blocked1>
+ %32 = arith.andi %30, %31 : tensor<64x1xi1, #blocked1>
+ %33 = arith.muli %28, %cst_5 : tensor<64x1xi64, #blocked>
+ %34 = tt.broadcast %33 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+ %35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+ %36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>) : i32 {
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
+ %49 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked2>
+ %50 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
+ %51 = arith.addi %49, %13 : tensor<1x64xi32, #blocked2>
+ %52 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>
+ %53 = arith.cmpi slt, %51, %cst_13 : tensor<1x64xi32, #blocked2>
+ %54 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %55 = arith.addi %54, %22 : tensor<64x64xi32, #blocked>
+ %56 = tt.addptr %23, %55 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %57 = tt.broadcast %52 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+ %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
+ %59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
+ %60 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+ %61 = tt.broadcast %60 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+ %62 = arith.addi %61, %34 : tensor<64x64xi64, #blocked>
+ %63 = tt.addptr %35, %62 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+ %64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %65 = arith.addf %64, %59 : tensor<64x64xf32, #blocked>
+ %66 = arith.subf %65, %arg8 : tensor<64x64xf32, #blocked>
+ %67 = arith.addf %arg11, %cst_4 : tensor<64x64xf32, #blocked>
+ %68 = arith.addf %arg10, %cst_10 : tensor<64x64xf32, #blocked2>
+ %69 = arith.divf %66, %67 : tensor<64x64xf32, #blocked>
+ %70 = arith.addf %arg8, %69 : tensor<64x64xf32, #blocked>
+ %71 = arith.subf %65, %70 : tensor<64x64xf32, #blocked>
+ %72 = arith.mulf %66, %71 : tensor<64x64xf32, #blocked>
+ %73 = arith.addf %arg9, %72 : tensor<64x64xf32, #blocked>
+ %74 = arith.select %57, %70, %arg8 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+ %75 = arith.select %57, %73, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+ %76 = arith.select %57, %67, %arg11 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+ %77 = arith.select %58, %68, %arg10 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
+ scf.yield %74, %75, %77, %76 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
+ }
+ %37 = triton_gpu.convert_layout %36#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>
+ %38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
+ %48 = arith.subf %arg10, %arg7 : f32
+ %49 = arith.addf %arg9, %arg12 : f32
+ %50 = arith.cmpf oeq, %49, %cst_11 : f32
+ %51 = arith.divf %arg12, %49 : f32
+ %52 = arith.select %50, %cst_11, %51 : f32
+ %53 = arith.mulf %48, %52 : f32
+ %54 = arith.addf %arg7, %53 : f32
+ %55 = arith.addf %arg8, %arg11 : f32
+ %56 = arith.mulf %48, %48 : f32
+ %57 = arith.mulf %56, %arg9 : f32
+ %58 = arith.mulf %57, %52 : f32
+ %59 = arith.addf %55, %58 : f32
+ tt.reduce.return %54, %59, %49 : f32, f32, f32
+ }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+ %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+ %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+ %41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
+ %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+ %43 = arith.divf %40, %cst_15 : tensor<64x1xf32, #blocked>
+ %44 = arith.addf %43, %cst_14 : tensor<64x1xf32, #blocked>
+ %45 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
+ %46 = tt.broadcast %45 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
+ %49 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
+ %50 = arith.cmpi slt, %49, %cst_0 : tensor<1x64xi32, #blocked>
+ %51 = tt.broadcast %49 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %52 = arith.addi %51, %22 : tensor<64x64xi32, #blocked>
+ %53 = tt.addptr %23, %52 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %54 = tt.broadcast %50 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+ %55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %56 = tt.addptr %41, %49 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
+ %57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>
+ tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
+ %58 = arith.extsi %49 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+ %59 = tt.broadcast %58 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+ %60 = arith.addi %59, %34 : tensor<64x64xi64, #blocked>
+ %61 = tt.addptr %35, %60 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+ %62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %63 = arith.addf %62, %55 : tensor<64x64xf32, #blocked>
+ %64 = arith.subf %63, %42 : tensor<64x64xf32, #blocked>
+ %65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
+ %66 = tt.broadcast %65 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+ %67 = arith.mulf %64, %66 : tensor<64x64xf32, #blocked>
+ %68 = tt.broadcast %57 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+ %69 = arith.mulf %67, %68 : tensor<64x64xf32, #blocked>
+ %70 = arith.addi %51, %46 : tensor<64x64xi32, #blocked>
+ %71 = tt.addptr %47, %70 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %72 = arith.truncf %69 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
+ tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>
+ }
+ tt.return
+ }
+ }
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.cubin ADDED
Binary file (73.7 kB).
 
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir ADDED
@@ -0,0 +1,1360 @@
+ ; ModuleID = 'LLVMDialectModule'
+ source_filename = "LLVMDialectModule"
+
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+ %10 = lshr i32 %9, 3, !dbg !10
+ %11 = and i32 %10, 31, !dbg !10
+ %12 = and i32 %9, 63, !dbg !10
+ %13 = shl i32 %9, 3, !dbg !11
+ %14 = and i32 %13, 56, !dbg !11
+ %15 = or i32 %14, 4, !dbg !11
+ %16 = lshr i32 %9, 6, !dbg !12
+ %17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
+ %18 = shl i32 %17, 6, !dbg !14
+ %19 = or i32 %18, %11, !dbg !15
+ %20 = or i32 %19, 32, !dbg !15
+ %21 = or i32 %18, %12, !dbg !15
+ %22 = sext i32 %19 to i64, !dbg !16
+ %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16
+ %24 = sext i32 %20 to i64, !dbg !16
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
+ %26 = sext i32 %21 to i64, !dbg !16
+ %27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+ %44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
+ %45 = srem i32 %19, 512, !dbg !18
+ %46 = srem i32 %20, 512, !dbg !18
+ %47 = shl nsw i32 %45, 8, !dbg !19
+ %48 = shl nsw i32 %46, 8, !dbg !19
+ %49 = shl i32 %19, 8, !dbg !20
+ %50 = shl i32 %20, 8, !dbg !20
+ %51 = add i64 %44, 50257, !dbg !21
+ %52 = icmp slt i64 %28, 0, !dbg !22
+ %53 = icmp slt i64 %36, 0, !dbg !22
+ %54 = icmp slt i64 %44, 0, !dbg !22
+ %55 = select i1 %54, i64 %51, i64 %44, !dbg !23
+ %56 = icmp ugt i64 %55, 50256, !dbg !24
+ %57 = shl i64 %28, 8, !dbg !25
+ %58 = add i64 %57, 12865792, !dbg !25
+ %59 = select i1 %52, i64 %58, i64 %57, !dbg !25
+ %60 = shl i64 %36, 8, !dbg !25
+ %61 = add i64 %60, 12865792, !dbg !25
+ %62 = select i1 %53, i64 %61, i64 %60, !dbg !25
+ %63 = getelementptr float, ptr addrspace(1) %1, i64 %59
+ %64 = getelementptr float, ptr addrspace(1) %1, i64 %62
+ br label %65, !dbg !12
+
+ 65: ; preds = %8, %230
+ %66 = phi float [ 0.000000e+00, %8 ], [ %321, %230 ]
+ %67 = phi float [ 0.000000e+00, %8 ], [ %322, %230 ]
+ %68 = phi float [ 0.000000e+00, %8 ], [ %323, %230 ]
+ %69 = phi float [ 0.000000e+00, %8 ], [ %324, %230 ]
+ %70 = phi float [ 0.000000e+00, %8 ], [ %325, %230 ]
+ %71 = phi float [ 0.000000e+00, %8 ], [ %326, %230 ]
+ %72 = phi float [ 0.000000e+00, %8 ], [ %327, %230 ]
+ %73 = phi float [ 0.000000e+00, %8 ], [ %328, %230 ]
+ %74 = phi float [ 0.000000e+00, %8 ], [ %329, %230 ]
+ %75 = phi float [ 0.000000e+00, %8 ], [ %330, %230 ]
+ %76 = phi float [ 0.000000e+00, %8 ], [ %331, %230 ]
+ %77 = phi float [ 0.000000e+00, %8 ], [ %332, %230 ]
+ %78 = phi float [ 0.000000e+00, %8 ], [ %333, %230 ]
+ %79 = phi float [ 0.000000e+00, %8 ], [ %334, %230 ]
+ %80 = phi float [ 0.000000e+00, %8 ], [ %335, %230 ]
+ %81 = phi float [ 0.000000e+00, %8 ], [ %336, %230 ]
+ %82 = phi float [ 0.000000e+00, %8 ], [ %337, %230 ]
+ %83 = phi float [ 0.000000e+00, %8 ], [ %338, %230 ]
+ %84 = phi float [ 0.000000e+00, %8 ], [ %339, %230 ]
+ %85 = phi float [ 0.000000e+00, %8 ], [ %340, %230 ]
+ %86 = phi float [ 0.000000e+00, %8 ], [ %341, %230 ]
+ %87 = phi float [ 0.000000e+00, %8 ], [ %342, %230 ]
+ %88 = phi float [ 0.000000e+00, %8 ], [ %343, %230 ]
+ %89 = phi float [ 0.000000e+00, %8 ], [ %344, %230 ]
+ %90 = phi float [ 0.000000e+00, %8 ], [ %345, %230 ]
+ %91 = phi float [ 0.000000e+00, %8 ], [ %346, %230 ]
+ %92 = phi float [ 0.000000e+00, %8 ], [ %347, %230 ]
+ %93 = phi float [ 0.000000e+00, %8 ], [ %348, %230 ]
+ %94 = phi float [ 0.000000e+00, %8 ], [ %349, %230 ]
+ %95 = phi float [ 0.000000e+00, %8 ], [ %350, %230 ]
+ %96 = phi float [ 0.000000e+00, %8 ], [ %351, %230 ]
+ %97 = phi float [ 0.000000e+00, %8 ], [ %352, %230 ]
+ %98 = phi float [ 0.000000e+00, %8 ], [ %417, %230 ]
+ %99 = phi float [ 0.000000e+00, %8 ], [ %418, %230 ]
+ %100 = phi float [ 0.000000e+00, %8 ], [ %419, %230 ]
+ %101 = phi float [ 0.000000e+00, %8 ], [ %420, %230 ]
+ %102 = phi float [ 0.000000e+00, %8 ], [ %421, %230 ]
+ %103 = phi float [ 0.000000e+00, %8 ], [ %422, %230 ]
+ %104 = phi float [ 0.000000e+00, %8 ], [ %423, %230 ]
+ %105 = phi float [ 0.000000e+00, %8 ], [ %424, %230 ]
+ %106 = phi float [ 0.000000e+00, %8 ], [ %425, %230 ]
+ %107 = phi float [ 0.000000e+00, %8 ], [ %426, %230 ]
+ %108 = phi float [ 0.000000e+00, %8 ], [ %427, %230 ]
+ %109 = phi float [ 0.000000e+00, %8 ], [ %428, %230 ]
+ %110 = phi float [ 0.000000e+00, %8 ], [ %429, %230 ]
+ %111 = phi float [ 0.000000e+00, %8 ], [ %430, %230 ]
+ %112 = phi float [ 0.000000e+00, %8 ], [ %431, %230 ]
+ %113 = phi float [ 0.000000e+00, %8 ], [ %432, %230 ]
+ %114 = phi float [ 0.000000e+00, %8 ], [ %369, %230 ]
+ %115 = phi float [ 0.000000e+00, %8 ], [ %370, %230 ]
+ %116 = phi float [ 0.000000e+00, %8 ], [ %371, %230 ]
+ %117 = phi float [ 0.000000e+00, %8 ], [ %372, %230 ]
+ %118 = phi float [ 0.000000e+00, %8 ], [ %373, %230 ]
+ %119 = phi float [ 0.000000e+00, %8 ], [ %374, %230 ]
+ %120 = phi float [ 0.000000e+00, %8 ], [ %375, %230 ]
+ %121 = phi float [ 0.000000e+00, %8 ], [ %376, %230 ]
+ %122 = phi float [ 0.000000e+00, %8 ], [ %377, %230 ]
+ %123 = phi float [ 0.000000e+00, %8 ], [ %378, %230 ]
+ %124 = phi float [ 0.000000e+00, %8 ], [ %379, %230 ]
+ %125 = phi float [ 0.000000e+00, %8 ], [ %380, %230 ]
+ %126 = phi float [ 0.000000e+00, %8 ], [ %381, %230 ]
+ %127 = phi float [ 0.000000e+00, %8 ], [ %382, %230 ]
+ %128 = phi float [ 0.000000e+00, %8 ], [ %383, %230 ]
+ %129 = phi float [ 0.000000e+00, %8 ], [ %384, %230 ]
+ %130 = phi i32 [ 0, %8 ], [ %433, %230 ]
+ %131 = or i32 %130, %14, !dbg !26
+ %132 = or i32 %130, %15, !dbg !26
+ %133 = add i32 %131, %47, !dbg !27
+ %134 = add i32 %132, %47, !dbg !27
+ %135 = add i32 %131, %48, !dbg !27
+ %136 = add i32 %132, %48, !dbg !27
+ %137 = sext i32 %133 to i64, !dbg !28
+ %138 = getelementptr float, ptr addrspace(1) %2, i64 %137, !dbg !28
+ %139 = sext i32 %134 to i64, !dbg !28
+ %140 = getelementptr float, ptr addrspace(1) %2, i64 %139, !dbg !28
+ %141 = sext i32 %135 to i64, !dbg !28
+ %142 = getelementptr float, ptr addrspace(1) %2, i64 %141, !dbg !28
+ %143 = sext i32 %136 to i64, !dbg !28
+ %144 = getelementptr float, ptr addrspace(1) %2, i64 %143, !dbg !28
+ %145 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %138, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+ %146 = extractvalue { i32, i32, i32, i32 } %145, 0, !dbg !29
+ %147 = extractvalue { i32, i32, i32, i32 } %145, 1, !dbg !29
+ %148 = extractvalue { i32, i32, i32, i32 } %145, 2, !dbg !29
+ %149 = extractvalue { i32, i32, i32, i32 } %145, 3, !dbg !29
+ %150 = bitcast i32 %146 to float, !dbg !29
+ %151 = bitcast i32 %147 to float, !dbg !29
+ %152 = bitcast i32 %148 to float, !dbg !29
+ %153 = bitcast i32 %149 to float, !dbg !29
+ %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %140, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+ %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !29
+ %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !29
+ %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !29
+ %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !29
+ %159 = bitcast i32 %155 to float, !dbg !29
+ %160 = bitcast i32 %156 to float, !dbg !29
+ %161 = bitcast i32 %157 to float, !dbg !29
+ %162 = bitcast i32 %158 to float, !dbg !29
+ %163 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %142, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+ %164 = extractvalue { i32, i32, i32, i32 } %163, 0, !dbg !29
+ %165 = extractvalue { i32, i32, i32, i32 } %163, 1, !dbg !29
+ %166 = extractvalue { i32, i32, i32, i32 } %163, 2, !dbg !29
+ %167 = extractvalue { i32, i32, i32, i32 } %163, 3, !dbg !29
+ %168 = bitcast i32 %164 to float, !dbg !29
+ %169 = bitcast i32 %165 to float, !dbg !29
+ %170 = bitcast i32 %166 to float, !dbg !29
+ %171 = bitcast i32 %167 to float, !dbg !29
+ %172 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %144, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+ %173 = extractvalue { i32, i32, i32, i32 } %172, 0, !dbg !29
+ %174 = extractvalue { i32, i32, i32, i32 } %172, 1, !dbg !29
+ %175 = extractvalue { i32, i32, i32, i32 } %172, 2, !dbg !29
+ %176 = extractvalue { i32, i32, i32, i32 } %172, 3, !dbg !29
+ %177 = bitcast i32 %173 to float, !dbg !29
+ %178 = bitcast i32 %174 to float, !dbg !29
+ %179 = bitcast i32 %175 to float, !dbg !29
+ %180 = bitcast i32 %176 to float, !dbg !29
+ %181 = add i32 %131, %49, !dbg !30
+ %182 = add i32 %131, %50, !dbg !30
+ %183 = sext i32 %181 to i64, !dbg !31
+ %184 = getelementptr i16, ptr addrspace(1) %3, i64 %183, !dbg !31
+ %185 = sext i32 %182 to i64, !dbg !31
+ %186 = getelementptr i16, ptr addrspace(1) %3, i64 %185, !dbg !31
+ %187 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+ %188 = extractvalue { i32, i32, i32, i32 } %187, 0, !dbg !32
+ %189 = extractvalue { i32, i32, i32, i32 } %187, 1, !dbg !32
+ %190 = extractvalue { i32, i32, i32, i32 } %187, 2, !dbg !32
+ %191 = extractvalue { i32, i32, i32, i32 } %187, 3, !dbg !32
+ %192 = trunc i32 %188 to i16, !dbg !32
+ %extelt.offset9 = lshr i32 %188, 16, !dbg !32
+ %193 = trunc i32 %extelt.offset9 to i16, !dbg !32
+ %194 = trunc i32 %189 to i16, !dbg !32
+ %extelt.offset10 = lshr i32 %189, 16, !dbg !32
+ %195 = trunc i32 %extelt.offset10 to i16, !dbg !32
+ %196 = trunc i32 %190 to i16, !dbg !32
+ %extelt.offset11 = lshr i32 %190, 16, !dbg !32
+ %197 = trunc i32 %extelt.offset11 to i16, !dbg !32
+ %198 = trunc i32 %191 to i16, !dbg !32
+ %extelt.offset12 = lshr i32 %191, 16, !dbg !32
+ %199 = trunc i32 %extelt.offset12 to i16, !dbg !32
+ %200 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %186, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+ %201 = extractvalue { i32, i32, i32, i32 } %200, 0, !dbg !32
+ %202 = extractvalue { i32, i32, i32, i32 } %200, 1, !dbg !32
+ %203 = extractvalue { i32, i32, i32, i32 } %200, 2, !dbg !32
+ %204 = extractvalue { i32, i32, i32, i32 } %200, 3, !dbg !32
+ %205 = trunc i32 %201 to i16, !dbg !32
+ %extelt.offset13 = lshr i32 %201, 16, !dbg !32
+ %206 = trunc i32 %extelt.offset13 to i16, !dbg !32
+ %207 = trunc i32 %202 to i16, !dbg !32
+ %extelt.offset14 = lshr i32 %202, 16, !dbg !32
+ %208 = trunc i32 %extelt.offset14 to i16, !dbg !32
+ %209 = trunc i32 %203 to i16, !dbg !32
+ %extelt.offset15 = lshr i32 %203, 16, !dbg !32
+ %210 = trunc i32 %extelt.offset15 to i16, !dbg !32
+ %211 = trunc i32 %204 to i16, !dbg !32
+ %extelt.offset16 = lshr i32 %204, 16, !dbg !32
+ %212 = trunc i32 %extelt.offset16 to i16, !dbg !32
+ %213 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %192) #6, !dbg !33
+ %214 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %193) #6, !dbg !33
+ %215 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %194) #6, !dbg !33
+ %216 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %195) #6, !dbg !33
+ %217 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %196) #6, !dbg !33
+ %218 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %197) #6, !dbg !33
+ %219 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %198) #6, !dbg !33
+ %220 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %199) #6, !dbg !33
+ %221 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %205) #6, !dbg !33
+ %222 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %206) #6, !dbg !33
+ %223 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %207) #6, !dbg !33
+ %224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %208) #6, !dbg !33
+ %225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #6, !dbg !33
+ %226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %210) #6, !dbg !33
+ %227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #6, !dbg !33
+ %228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %212) #6, !dbg !33
+ br i1 %56, label %229, label %230, !dbg !34
+
+ 229: ; preds = %65
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
+ br label %230, !dbg !34
+
+ 230: ; preds = %229, %65
+ %231 = zext nneg i32 %131 to i64, !dbg !35
+ %232 = zext nneg i32 %132 to i64, !dbg !35
+ %233 = getelementptr float, ptr addrspace(1) %63, i64 %231, !dbg !36
+ %234 = getelementptr float, ptr addrspace(1) %63, i64 %232, !dbg !36
+ %235 = getelementptr float, ptr addrspace(1) %64, i64 %231, !dbg !36
+ %236 = getelementptr float, ptr addrspace(1) %64, i64 %232, !dbg !36
+ %237 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %233, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+ %238 = extractvalue { i32, i32, i32, i32 } %237, 0, !dbg !37
+ %239 = extractvalue { i32, i32, i32, i32 } %237, 1, !dbg !37
+ %240 = extractvalue { i32, i32, i32, i32 } %237, 2, !dbg !37
+ %241 = extractvalue { i32, i32, i32, i32 } %237, 3, !dbg !37
+ %242 = bitcast i32 %238 to float, !dbg !37
+ %243 = bitcast i32 %239 to float, !dbg !37
+ %244 = bitcast i32 %240 to float, !dbg !37
+ %245 = bitcast i32 %241 to float, !dbg !37
+ %246 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %234, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+ %247 = extractvalue { i32, i32, i32, i32 } %246, 0, !dbg !37
+ %248 = extractvalue { i32, i32, i32, i32 } %246, 1, !dbg !37
+ %249 = extractvalue { i32, i32, i32, i32 } %246, 2, !dbg !37
+ %250 = extractvalue { i32, i32, i32, i32 } %246, 3, !dbg !37
+ %251 = bitcast i32 %247 to float, !dbg !37
+ %252 = bitcast i32 %248 to float, !dbg !37
+ %253 = bitcast i32 %249 to float, !dbg !37
+ %254 = bitcast i32 %250 to float, !dbg !37
+ %255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %235, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+ %256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !37
+ %257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !37
+ %258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !37
+ %259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !37
+ %260 = bitcast i32 %256 to float, !dbg !37
+ %261 = bitcast i32 %257 to float, !dbg !37
+ %262 = bitcast i32 %258 to float, !dbg !37
+ %263 = bitcast i32 %259 to float, !dbg !37
+ %264 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %236, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+ %265 = extractvalue { i32, i32, i32, i32 } %264, 0, !dbg !37
+ %266 = extractvalue { i32, i32, i32, i32 } %264, 1, !dbg !37
+ %267 = extractvalue { i32, i32, i32, i32 } %264, 2, !dbg !37
+ %268 = extractvalue { i32, i32, i32, i32 } %264, 3, !dbg !37
+ %269 = bitcast i32 %265 to float, !dbg !37
+ %270 = bitcast i32 %266 to float, !dbg !37
+ %271 = bitcast i32 %267 to float, !dbg !37
+ %272 = bitcast i32 %268 to float, !dbg !37
+ %273 = fadd float %150, %242, !dbg !38
+ %274 = fadd float %151, %243, !dbg !38
+ %275 = fadd float %152, %244, !dbg !38
+ %276 = fadd float %153, %245, !dbg !38
+ %277 = fadd float %159, %251, !dbg !38
+ %278 = fadd float %160, %252, !dbg !38
+ %279 = fadd float %161, %253, !dbg !38
+ %280 = fadd float %162, %254, !dbg !38
+ %281 = fadd float %168, %260, !dbg !38
+ %282 = fadd float %169, %261, !dbg !38
+ %283 = fadd float %170, %262, !dbg !38
+ %284 = fadd float %171, %263, !dbg !38
+ %285 = fadd float %177, %269, !dbg !38
+ %286 = fadd float %178, %270, !dbg !38
+ %287 = fadd float %179, %271, !dbg !38
+ %288 = fadd float %180, %272, !dbg !38
+ %289 = fadd float %213, %273, !dbg !39
+ %290 = fadd float %214, %274, !dbg !39
+ %291 = fadd float %215, %275, !dbg !39
+ %292 = fadd float %216, %276, !dbg !39
+ %293 = fadd float %217, %277, !dbg !39
+ %294 = fadd float %218, %278, !dbg !39
+ %295 = fadd float %219, %279, !dbg !39
+ %296 = fadd float %220, %280, !dbg !39
+ %297 = fadd float %221, %281, !dbg !39
+ %298 = fadd float %222, %282, !dbg !39
+ %299 = fadd float %223, %283, !dbg !39
+ %300 = fadd float %224, %284, !dbg !39
+ %301 = fadd float %225, %285, !dbg !39
+ %302 = fadd float %226, %286, !dbg !39
+ %303 = fadd float %227, %287, !dbg !39
+ %304 = fadd float %228, %288, !dbg !39
+ %305 = fsub float %289, %114, !dbg !40
+ %306 = fsub float %290, %115, !dbg !40
+ %307 = fsub float %291, %116, !dbg !40
+ %308 = fsub float %292, %117, !dbg !40
+ %309 = fsub float %293, %118, !dbg !40
+ %310 = fsub float %294, %119, !dbg !40
+ %311 = fsub float %295, %120, !dbg !40
+ %312 = fsub float %296, %121, !dbg !40
+ %313 = fsub float %297, %122, !dbg !40
+ %314 = fsub float %298, %123, !dbg !40
+ %315 = fsub float %299, %124, !dbg !40
+ %316 = fsub float %300, %125, !dbg !40
+ %317 = fsub float %301, %126, !dbg !40
+ %318 = fsub float %302, %127, !dbg !40
+ %319 = fsub float %303, %128, !dbg !40
+ %320 = fsub float %304, %129, !dbg !40
+ %321 = fadd float %66, 1.000000e+00, !dbg !44
+ %322 = fadd float %67, 1.000000e+00, !dbg !44
+ %323 = fadd float %68, 1.000000e+00, !dbg !44
+ %324 = fadd float %69, 1.000000e+00, !dbg !44
+ %325 = fadd float %70, 1.000000e+00, !dbg !44
+ %326 = fadd float %71, 1.000000e+00, !dbg !44
+ %327 = fadd float %72, 1.000000e+00, !dbg !44
+ %328 = fadd float %73, 1.000000e+00, !dbg !44
+ %329 = fadd float %74, 1.000000e+00, !dbg !44
+ %330 = fadd float %75, 1.000000e+00, !dbg !44
+ %331 = fadd float %76, 1.000000e+00, !dbg !44
+ %332 = fadd float %77, 1.000000e+00, !dbg !44
+ %333 = fadd float %78, 1.000000e+00, !dbg !44
+ %334 = fadd float %79, 1.000000e+00, !dbg !44
+ %335 = fadd float %80, 1.000000e+00, !dbg !44
+ %336 = fadd float %81, 1.000000e+00, !dbg !44
+ %337 = fadd float %82, 1.000000e+00, !dbg !44
+ %338 = fadd float %83, 1.000000e+00, !dbg !44
+ %339 = fadd float %84, 1.000000e+00, !dbg !44
+ %340 = fadd float %85, 1.000000e+00, !dbg !44
+ %341 = fadd float %86, 1.000000e+00, !dbg !44
+ %342 = fadd float %87, 1.000000e+00, !dbg !44
+ %343 = fadd float %88, 1.000000e+00, !dbg !44
+ %344 = fadd float %89, 1.000000e+00, !dbg !44
+ %345 = fadd float %90, 1.000000e+00, !dbg !44
+ %346 = fadd float %91, 1.000000e+00, !dbg !44
+ %347 = fadd float %92, 1.000000e+00, !dbg !44
+ %348 = fadd float %93, 1.000000e+00, !dbg !44
+ %349 = fadd float %94, 1.000000e+00, !dbg !44
+ %350 = fadd float %95, 1.000000e+00, !dbg !44
+ %351 = fadd float %96, 1.000000e+00, !dbg !44
+ %352 = fadd float %97, 1.000000e+00, !dbg !44
+ %353 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %305, float %321) #6, !dbg !45
+ %354 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %306, float %322) #6, !dbg !45
+ %355 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %307, float %323) #6, !dbg !45
+ %356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %308, float %324) #6, !dbg !45
+ %357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %309, float %325) #6, !dbg !45
+ %358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %310, float %326) #6, !dbg !45
+ %359 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %311, float %327) #6, !dbg !45
+ %360 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %312, float %328) #6, !dbg !45
+ %361 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %313, float %329) #6, !dbg !45
+ %362 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %314, float %330) #6, !dbg !45
+ %363 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %315, float %331) #6, !dbg !45
+ %364 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %316, float %332) #6, !dbg !45
+ %365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %317, float %333) #6, !dbg !45
+ %366 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %318, float %334) #6, !dbg !45
+ %367 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %319, float %335) #6, !dbg !45
+ %368 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %320, float %336) #6, !dbg !45
+ %369 = fadd float %114, %353, !dbg !46
+ %370 = fadd float %115, %354, !dbg !46
+ %371 = fadd float %116, %355, !dbg !46
+ %372 = fadd float %117, %356, !dbg !46
+ %373 = fadd float %118, %357, !dbg !46
+ %374 = fadd float %119, %358, !dbg !46
+ %375 = fadd float %120, %359, !dbg !46
+ %376 = fadd float %121, %360, !dbg !46
+ %377 = fadd float %122, %361, !dbg !46
+ %378 = fadd float %123, %362, !dbg !46
+ %379 = fadd float %124, %363, !dbg !46
+ %380 = fadd float %125, %364, !dbg !46
+ %381 = fadd float %126, %365, !dbg !46
+ %382 = fadd float %127, %366, !dbg !46
+ %383 = fadd float %128, %367, !dbg !46
+ %384 = fadd float %129, %368, !dbg !46
+ %385 = fsub float %289, %369, !dbg !47
+ %386 = fsub float %290, %370, !dbg !47
+ %387 = fsub float %291, %371, !dbg !47
+ %388 = fsub float %292, %372, !dbg !47
+ %389 = fsub float %293, %373, !dbg !47
+ %390 = fsub float %294, %374, !dbg !47
+ %391 = fsub float %295, %375, !dbg !47
+ %392 = fsub float %296, %376, !dbg !47
+ %393 = fsub float %297, %377, !dbg !47
+ %394 = fsub float %298, %378, !dbg !47
+ %395 = fsub float %299, %379, !dbg !47
+ %396 = fsub float %300, %380, !dbg !47
+ %397 = fsub float %301, %381, !dbg !47
+ %398 = fsub float %302, %382, !dbg !47
+ %399 = fsub float %303, %383, !dbg !47
+ %400 = fsub float %304, %384, !dbg !47
+ %401 = fmul float %305, %385, !dbg !48
+ %402 = fmul float %306, %386, !dbg !48
+ %403 = fmul float %307, %387, !dbg !48
+ %404 = fmul float %308, %388, !dbg !48
+ %405 = fmul float %309, %389, !dbg !48
+ %406 = fmul float %310, %390, !dbg !48
+ %407 = fmul float %311, %391, !dbg !48
+ %408 = fmul float %312, %392, !dbg !48
+ %409 = fmul float %313, %393, !dbg !48
+ %410 = fmul float %314, %394, !dbg !48
+ %411 = fmul float %315, %395, !dbg !48
+ %412 = fmul float %316, %396, !dbg !48
+ %413 = fmul float %317, %397, !dbg !48
+ %414 = fmul float %318, %398, !dbg !48
+ %415 = fmul float %319, %399, !dbg !48
+ %416 = fmul float %320, %400, !dbg !48
+ %417 = fadd float %98, %401, !dbg !49
+ %418 = fadd float %99, %402, !dbg !49
+ %419 = fadd float %100, %403, !dbg !49
+ %420 = fadd float %101, %404, !dbg !49
+ %421 = fadd float %102, %405, !dbg !49
+ %422 = fadd float %103, %406, !dbg !49
+ %423 = fadd float %104, %407, !dbg !49
+ %424 = fadd float %105, %408, !dbg !49
+ %425 = fadd float %106, %409, !dbg !49
+ %426 = fadd float %107, %410, !dbg !49
+ %427 = fadd float %108, %411, !dbg !49
+ %428 = fadd float %109, %412, !dbg !49
+ %429 = fadd float %110, %413, !dbg !49
+ %430 = fadd float %111, %414, !dbg !49
+ %431 = fadd float %112, %415, !dbg !49
+ %432 = fadd float %113, %416, !dbg !49
+ %433 = add nuw nsw i32 %130, 64, !dbg !12
+ %434 = icmp ult i32 %130, 192, !dbg !12
+ br i1 %434, label %65, label %435, !dbg !12
+
+ 435: ; preds = %230
+ %436 = and i32 %16, 3, !dbg !12
+ %437 = mul nuw nsw i32 %436, 72, !dbg !12
+ %438 = add nuw nsw i32 %437, %12, !dbg !12
+ %439 = zext nneg i32 %438 to i64, !dbg !12
+ %440 = getelementptr float, ptr addrspace(3) @global_smem, i64 %439, !dbg !12
+ %441 = insertelement <1 x float> undef, float %337, i64 0, !dbg !12
+ store <1 x float> %441, ptr addrspace(3) %440, align 4, !dbg !12
+ %442 = add nuw nsw i32 %12, 288, !dbg !12
+ %443 = add nuw nsw i32 %442, %437, !dbg !12
+ %444 = zext nneg i32 %443 to i64, !dbg !12
+ %445 = getelementptr float, ptr addrspace(3) @global_smem, i64 %444, !dbg !12
+ %446 = insertelement <1 x float> undef, float %338, i64 0, !dbg !12
+ store <1 x float> %446, ptr addrspace(3) %445, align 4, !dbg !12
+ %447 = or i32 %12, 576, !dbg !12
+ %448 = add nuw nsw i32 %447, %437, !dbg !12
+ %449 = zext nneg i32 %448 to i64, !dbg !12
+ %450 = getelementptr float, ptr addrspace(3) @global_smem, i64 %449, !dbg !12
+ %451 = insertelement <1 x float> undef, float %339, i64 0, !dbg !12
+ store <1 x float> %451, ptr addrspace(3) %450, align 4, !dbg !12
+ %452 = add nuw nsw i32 %12, 864, !dbg !12
+ %453 = add nuw nsw i32 %452, %437, !dbg !12
+ %454 = zext nneg i32 %453 to i64, !dbg !12
+ %455 = getelementptr float, ptr addrspace(3) @global_smem, i64 %454, !dbg !12
+ %456 = insertelement <1 x float> undef, float %340, i64 0, !dbg !12
+ store <1 x float> %456, ptr addrspace(3) %455, align 4, !dbg !12
+ %457 = or i32 %12, 1152, !dbg !12
+ %458 = add nuw nsw i32 %457, %437, !dbg !12
+ %459 = zext nneg i32 %458 to i64, !dbg !12
+ %460 = getelementptr float, ptr addrspace(3) @global_smem, i64 %459, !dbg !12
+ %461 = insertelement <1 x float> undef, float %341, i64 0, !dbg !12
+ store <1 x float> %461, ptr addrspace(3) %460, align 4, !dbg !12
+ %462 = add nuw nsw i32 %12, 1440, !dbg !12
+ %463 = add nuw nsw i32 %462, %437, !dbg !12
+ %464 = zext nneg i32 %463 to i64, !dbg !12
+ %465 = getelementptr float, ptr addrspace(3) @global_smem, i64 %464, !dbg !12
+ %466 = insertelement <1 x float> undef, float %342, i64 0, !dbg !12
+ store <1 x float> %466, ptr addrspace(3) %465, align 4, !dbg !12
+ %467 = or i32 %12, 1728, !dbg !12
+ %468 = add nuw nsw i32 %467, %437, !dbg !12
+ %469 = zext nneg i32 %468 to i64, !dbg !12
+ %470 = getelementptr float, ptr addrspace(3) @global_smem, i64 %469, !dbg !12
+ %471 = insertelement <1 x float> undef, float %343, i64 0, !dbg !12
+ store <1 x float> %471, ptr addrspace(3) %470, align 4, !dbg !12
+ %472 = add nuw nsw i32 %12, 2016, !dbg !12
+ %473 = add nuw nsw i32 %472, %437, !dbg !12
+ %474 = zext nneg i32 %473 to i64, !dbg !12
+ %475 = getelementptr float, ptr addrspace(3) @global_smem, i64 %474, !dbg !12
+ %476 = insertelement <1 x float> undef, float %344, i64 0, !dbg !12
+ store <1 x float> %476, ptr addrspace(3) %475, align 4, !dbg !12
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
+ %477 = mul nuw nsw i32 %11, 72, !dbg !12
+ %478 = add nuw nsw i32 %477, %14, !dbg !12
+ %479 = zext nneg i32 %478 to i64, !dbg !12
+ %480 = getelementptr float, ptr addrspace(3) @global_smem, i64 %479, !dbg !12
+ %481 = load float, ptr addrspace(3) %480, align 32, !dbg !12
+ %482 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 1, !dbg !12
+ %483 = load float, ptr addrspace(3) %482, align 4, !dbg !12
+ %484 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 2, !dbg !12
+ %485 = load float, ptr addrspace(3) %484, align 8, !dbg !12
+ %486 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 3, !dbg !12
+ %487 = load float, ptr addrspace(3) %486, align 4, !dbg !12
+ %488 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 4, !dbg !12
+ %489 = load float, ptr addrspace(3) %488, align 16, !dbg !12
+ %490 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 5, !dbg !12
+ %491 = load float, ptr addrspace(3) %490, align 4, !dbg !12
+ %492 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 6, !dbg !12
+ %493 = load float, ptr addrspace(3) %492, align 8, !dbg !12
+ %494 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 7, !dbg !12
+ %495 = load float, ptr addrspace(3) %494, align 4, !dbg !12
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
+ %496 = insertelement <1 x float> undef, float %345, i64 0, !dbg !12
+ store <1 x float> %496, ptr addrspace(3) %440, align 4, !dbg !12
+ %497 = insertelement <1 x float> undef, float %346, i64 0, !dbg !12
+ store <1 x float> %497, ptr addrspace(3) %445, align 4, !dbg !12
+ %498 = insertelement <1 x float> undef, float %347, i64 0, !dbg !12
+ store <1 x float> %498, ptr addrspace(3) %450, align 4, !dbg !12
+ %499 = insertelement <1 x float> undef, float %348, i64 0, !dbg !12
+ store <1 x float> %499, ptr addrspace(3) %455, align 4, !dbg !12
+ %500 = insertelement <1 x float> undef, float %349, i64 0, !dbg !12
+ store <1 x float> %500, ptr addrspace(3) %460, align 4, !dbg !12
+ %501 = insertelement <1 x float> undef, float %350, i64 0, !dbg !12
+ store <1 x float> %501, ptr addrspace(3) %465, align 4, !dbg !12
+ %502 = insertelement <1 x float> undef, float %351, i64 0, !dbg !12
+ store <1 x float> %502, ptr addrspace(3) %470, align 4, !dbg !12
+ %503 = insertelement <1 x float> undef, float %352, i64 0, !dbg !12
+ store <1 x float> %503, ptr addrspace(3) %475, align 4, !dbg !12
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
+ %504 = load float, ptr addrspace(3) %480, align 32, !dbg !12
+ %505 = load float, ptr addrspace(3) %482, align 4, !dbg !12
+ %506 = load float, ptr addrspace(3) %484, align 8, !dbg !12
+ %507 = load float, ptr addrspace(3) %486, align 4, !dbg !12
+ %508 = load float, ptr addrspace(3) %488, align 16, !dbg !12
+ %509 = load float, ptr addrspace(3) %490, align 4, !dbg !12
+ %510 = load float, ptr addrspace(3) %492, align 8, !dbg !12
+ %511 = load float, ptr addrspace(3) %494, align 4, !dbg !12
+ %512 = fsub float %370, %369, !dbg !50
556
+ %513 = fadd float %481, %483, !dbg !54
557
+ %514 = fcmp oeq float %513, 0.000000e+00, !dbg !55
558
+ %515 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %483, float %513) #6, !dbg !56
559
+ %516 = select i1 %514, float 0.000000e+00, float %515, !dbg !57
560
+ %517 = fmul float %512, %516, !dbg !58
561
+ %518 = fadd float %369, %517, !dbg !59
562
+ %519 = fadd float %417, %418, !dbg !60
563
+ %520 = fmul float %512, %512, !dbg !61
564
+ %521 = fmul float %520, %481, !dbg !62
565
+ %522 = fmul float %521, %516, !dbg !63
566
+ %523 = fadd float %519, %522, !dbg !64
567
+ %524 = fsub float %371, %518, !dbg !50
568
+ %525 = fadd float %485, %513, !dbg !54
569
+ %526 = fcmp oeq float %525, 0.000000e+00, !dbg !55
570
+ %527 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %485, float %525) #6, !dbg !56
571
+ %528 = select i1 %526, float 0.000000e+00, float %527, !dbg !57
572
+ %529 = fmul float %528, %524, !dbg !58
573
+ %530 = fadd float %518, %529, !dbg !59
574
+ %531 = fadd float %419, %523, !dbg !60
575
+ %532 = fmul float %524, %524, !dbg !61
576
+ %533 = fmul float %513, %532, !dbg !62
577
+ %534 = fmul float %528, %533, !dbg !63
578
+ %535 = fadd float %531, %534, !dbg !64
579
+ %536 = fsub float %372, %530, !dbg !50
580
+ %537 = fadd float %487, %525, !dbg !54
581
+ %538 = fcmp oeq float %537, 0.000000e+00, !dbg !55
582
+ %539 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %487, float %537) #6, !dbg !56
583
+ %540 = select i1 %538, float 0.000000e+00, float %539, !dbg !57
584
+ %541 = fmul float %540, %536, !dbg !58
585
+ %542 = fadd float %530, %541, !dbg !59
586
+ %543 = fadd float %420, %535, !dbg !60
587
+ %544 = fmul float %536, %536, !dbg !61
588
+ %545 = fmul float %525, %544, !dbg !62
589
+ %546 = fmul float %540, %545, !dbg !63
590
+ %547 = fadd float %543, %546, !dbg !64
591
+ %548 = fsub float %373, %542, !dbg !50
592
+ %549 = fadd float %489, %537, !dbg !54
593
+ %550 = fcmp oeq float %549, 0.000000e+00, !dbg !55
594
+ %551 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %489, float %549) #6, !dbg !56
595
+ %552 = select i1 %550, float 0.000000e+00, float %551, !dbg !57
596
+ %553 = fmul float %552, %548, !dbg !58
597
+ %554 = fadd float %542, %553, !dbg !59
598
+ %555 = fadd float %421, %547, !dbg !60
599
+ %556 = fmul float %548, %548, !dbg !61
600
+ %557 = fmul float %537, %556, !dbg !62
601
+ %558 = fmul float %552, %557, !dbg !63
602
+ %559 = fadd float %555, %558, !dbg !64
603
+ %560 = fsub float %374, %554, !dbg !50
604
+ %561 = fadd float %491, %549, !dbg !54
605
+ %562 = fcmp oeq float %561, 0.000000e+00, !dbg !55
606
+ %563 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %491, float %561) #6, !dbg !56
607
+ %564 = select i1 %562, float 0.000000e+00, float %563, !dbg !57
608
+ %565 = fmul float %564, %560, !dbg !58
609
+ %566 = fadd float %554, %565, !dbg !59
610
+ %567 = fadd float %422, %559, !dbg !60
611
+ %568 = fmul float %560, %560, !dbg !61
612
+ %569 = fmul float %549, %568, !dbg !62
613
+ %570 = fmul float %564, %569, !dbg !63
614
+ %571 = fadd float %567, %570, !dbg !64
615
+ %572 = fsub float %375, %566, !dbg !50
616
+ %573 = fadd float %493, %561, !dbg !54
617
+ %574 = fcmp oeq float %573, 0.000000e+00, !dbg !55
618
+ %575 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %493, float %573) #6, !dbg !56
619
+ %576 = select i1 %574, float 0.000000e+00, float %575, !dbg !57
620
+ %577 = fmul float %576, %572, !dbg !58
621
+ %578 = fadd float %566, %577, !dbg !59
622
+ %579 = fadd float %423, %571, !dbg !60
623
+ %580 = fmul float %572, %572, !dbg !61
624
+ %581 = fmul float %561, %580, !dbg !62
625
+ %582 = fmul float %576, %581, !dbg !63
626
+ %583 = fadd float %579, %582, !dbg !64
627
+ %584 = fsub float %376, %578, !dbg !50
628
+ %585 = fadd float %495, %573, !dbg !54
629
+ %586 = fcmp oeq float %585, 0.000000e+00, !dbg !55
630
+ %587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %495, float %585) #6, !dbg !56
631
+ %588 = select i1 %586, float 0.000000e+00, float %587, !dbg !57
632
+ %589 = fmul float %588, %584, !dbg !58
633
+ %590 = fadd float %578, %589, !dbg !59
634
+ %591 = fadd float %424, %583, !dbg !60
635
+ %592 = fmul float %584, %584, !dbg !61
636
+ %593 = fmul float %573, %592, !dbg !62
637
+ %594 = fmul float %588, %593, !dbg !63
638
+ %595 = fadd float %591, %594, !dbg !64
639
+ %596 = fsub float %378, %377, !dbg !50
640
+ %597 = fadd float %504, %505, !dbg !54
641
+ %598 = fcmp oeq float %597, 0.000000e+00, !dbg !55
642
+ %599 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %505, float %597) #6, !dbg !56
643
+ %600 = select i1 %598, float 0.000000e+00, float %599, !dbg !57
644
+ %601 = fmul float %596, %600, !dbg !58
645
+ %602 = fadd float %377, %601, !dbg !59
646
+ %603 = fadd float %425, %426, !dbg !60
647
+ %604 = fmul float %596, %596, !dbg !61
648
+ %605 = fmul float %604, %504, !dbg !62
649
+ %606 = fmul float %605, %600, !dbg !63
650
+ %607 = fadd float %603, %606, !dbg !64
651
+ %608 = fsub float %379, %602, !dbg !50
652
+ %609 = fadd float %506, %597, !dbg !54
653
+ %610 = fcmp oeq float %609, 0.000000e+00, !dbg !55
654
+ %611 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %506, float %609) #6, !dbg !56
655
+ %612 = select i1 %610, float 0.000000e+00, float %611, !dbg !57
656
+ %613 = fmul float %612, %608, !dbg !58
657
+ %614 = fadd float %602, %613, !dbg !59
658
+ %615 = fadd float %427, %607, !dbg !60
659
+ %616 = fmul float %608, %608, !dbg !61
660
+ %617 = fmul float %597, %616, !dbg !62
661
+ %618 = fmul float %612, %617, !dbg !63
662
+ %619 = fadd float %615, %618, !dbg !64
663
+ %620 = fsub float %380, %614, !dbg !50
664
+ %621 = fadd float %507, %609, !dbg !54
665
+ %622 = fcmp oeq float %621, 0.000000e+00, !dbg !55
666
+ %623 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %507, float %621) #6, !dbg !56
667
+ %624 = select i1 %622, float 0.000000e+00, float %623, !dbg !57
668
+ %625 = fmul float %624, %620, !dbg !58
669
+ %626 = fadd float %614, %625, !dbg !59
670
+ %627 = fadd float %428, %619, !dbg !60
671
+ %628 = fmul float %620, %620, !dbg !61
672
+ %629 = fmul float %609, %628, !dbg !62
673
+ %630 = fmul float %624, %629, !dbg !63
674
+ %631 = fadd float %627, %630, !dbg !64
675
+ %632 = fsub float %381, %626, !dbg !50
676
+ %633 = fadd float %508, %621, !dbg !54
677
+ %634 = fcmp oeq float %633, 0.000000e+00, !dbg !55
678
+ %635 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %508, float %633) #6, !dbg !56
679
+ %636 = select i1 %634, float 0.000000e+00, float %635, !dbg !57
680
+ %637 = fmul float %636, %632, !dbg !58
681
+ %638 = fadd float %626, %637, !dbg !59
682
+ %639 = fadd float %429, %631, !dbg !60
683
+ %640 = fmul float %632, %632, !dbg !61
684
+ %641 = fmul float %621, %640, !dbg !62
685
+ %642 = fmul float %636, %641, !dbg !63
686
+ %643 = fadd float %639, %642, !dbg !64
687
+ %644 = fsub float %382, %638, !dbg !50
688
+ %645 = fadd float %509, %633, !dbg !54
689
+ %646 = fcmp oeq float %645, 0.000000e+00, !dbg !55
690
+ %647 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %509, float %645) #6, !dbg !56
691
+ %648 = select i1 %646, float 0.000000e+00, float %647, !dbg !57
692
+ %649 = fmul float %648, %644, !dbg !58
693
+ %650 = fadd float %638, %649, !dbg !59
694
+ %651 = fadd float %430, %643, !dbg !60
695
+ %652 = fmul float %644, %644, !dbg !61
696
+ %653 = fmul float %633, %652, !dbg !62
697
+ %654 = fmul float %648, %653, !dbg !63
698
+ %655 = fadd float %651, %654, !dbg !64
699
+ %656 = fsub float %383, %650, !dbg !50
700
+ %657 = fadd float %510, %645, !dbg !54
701
+ %658 = fcmp oeq float %657, 0.000000e+00, !dbg !55
702
+ %659 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %510, float %657) #6, !dbg !56
703
+ %660 = select i1 %658, float 0.000000e+00, float %659, !dbg !57
704
+ %661 = fmul float %660, %656, !dbg !58
705
+ %662 = fadd float %650, %661, !dbg !59
706
+ %663 = fadd float %431, %655, !dbg !60
707
+ %664 = fmul float %656, %656, !dbg !61
708
+ %665 = fmul float %645, %664, !dbg !62
709
+ %666 = fmul float %660, %665, !dbg !63
710
+ %667 = fadd float %663, %666, !dbg !64
711
+ %668 = fsub float %384, %662, !dbg !50
712
+ %669 = fadd float %511, %657, !dbg !54
713
+ %670 = fcmp oeq float %669, 0.000000e+00, !dbg !55
714
+ %671 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %511, float %669) #6, !dbg !56
715
+ %672 = select i1 %670, float 0.000000e+00, float %671, !dbg !57
716
+ %673 = fmul float %672, %668, !dbg !58
717
+ %674 = fadd float %662, %673, !dbg !59
718
+ %675 = fadd float %432, %667, !dbg !60
719
+ %676 = fmul float %668, %668, !dbg !61
720
+ %677 = fmul float %657, %676, !dbg !62
721
+ %678 = fmul float %672, %677, !dbg !63
722
+ %679 = fadd float %675, %678, !dbg !64
723
+ %680 = bitcast float %590 to i32, !dbg !65
724
+ %681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %680, i32 4, i32 31), !dbg !65
725
+ %682 = bitcast i32 %681 to float, !dbg !65
726
+ %683 = bitcast float %595 to i32, !dbg !65
727
+ %684 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %683, i32 4, i32 31), !dbg !65
728
+ %685 = bitcast i32 %684 to float, !dbg !65
729
+ %686 = bitcast float %585 to i32, !dbg !65
730
+ %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %686, i32 4, i32 31), !dbg !65
731
+ %688 = bitcast i32 %687 to float, !dbg !65
732
+ %689 = fsub float %682, %590, !dbg !50
733
+ %690 = fadd float %585, %688, !dbg !54
734
+ %691 = fcmp oeq float %690, 0.000000e+00, !dbg !55
735
+ %692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %688, float %690) #6, !dbg !56
736
+ %693 = select i1 %691, float 0.000000e+00, float %692, !dbg !57
737
+ %694 = fmul float %693, %689, !dbg !58
738
+ %695 = fadd float %590, %694, !dbg !59
739
+ %696 = fadd float %595, %685, !dbg !60
740
+ %697 = fmul float %689, %689, !dbg !61
741
+ %698 = fmul float %585, %697, !dbg !62
742
+ %699 = fmul float %693, %698, !dbg !63
743
+ %700 = fadd float %696, %699, !dbg !64
744
+ %701 = bitcast float %695 to i32, !dbg !65
745
+ %702 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %701, i32 2, i32 31), !dbg !65
746
+ %703 = bitcast i32 %702 to float, !dbg !65
747
+ %704 = bitcast float %700 to i32, !dbg !65
748
+ %705 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %704, i32 2, i32 31), !dbg !65
749
+ %706 = bitcast i32 %705 to float, !dbg !65
750
+ %707 = bitcast float %690 to i32, !dbg !65
751
+ %708 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 2, i32 31), !dbg !65
752
+ %709 = bitcast i32 %708 to float, !dbg !65
753
+ %710 = fsub float %703, %695, !dbg !50
754
+ %711 = fadd float %690, %709, !dbg !54
755
+ %712 = fcmp oeq float %711, 0.000000e+00, !dbg !55
756
+ %713 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %709, float %711) #6, !dbg !56
757
+ %714 = select i1 %712, float 0.000000e+00, float %713, !dbg !57
758
+ %715 = fmul float %714, %710, !dbg !58
759
+ %716 = fadd float %695, %715, !dbg !59
760
+ %717 = fadd float %700, %706, !dbg !60
761
+ %718 = fmul float %710, %710, !dbg !61
762
+ %719 = fmul float %690, %718, !dbg !62
763
+ %720 = fmul float %714, %719, !dbg !63
764
+ %721 = fadd float %717, %720, !dbg !64
765
+ %722 = bitcast float %716 to i32, !dbg !65
766
+ %723 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %722, i32 1, i32 31), !dbg !65
767
+ %724 = bitcast i32 %723 to float, !dbg !65
768
+ %725 = bitcast float %721 to i32, !dbg !65
769
+ %726 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %725, i32 1, i32 31), !dbg !65
770
+ %727 = bitcast i32 %726 to float, !dbg !65
771
+ %728 = bitcast float %711 to i32, !dbg !65
772
+ %729 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %728, i32 1, i32 31), !dbg !65
773
+ %730 = bitcast i32 %729 to float, !dbg !65
774
+ %731 = fsub float %724, %716, !dbg !50
775
+ %732 = fadd float %711, %730, !dbg !54
776
+ %733 = fcmp oeq float %732, 0.000000e+00, !dbg !55
777
+ %734 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %730, float %732) #6, !dbg !56
778
+ %735 = select i1 %733, float 0.000000e+00, float %734, !dbg !57
779
+ %736 = fmul float %731, %735, !dbg !58
780
+ %737 = fadd float %716, %736, !dbg !59
781
+ %738 = fadd float %721, %727, !dbg !60
782
+ %739 = fmul float %731, %731, !dbg !61
783
+ %740 = fmul float %711, %739, !dbg !62
784
+ %741 = fmul float %735, %740, !dbg !63
785
+ %742 = fadd float %738, %741, !dbg !64
786
+ %743 = bitcast float %674 to i32, !dbg !65
787
+ %744 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %743, i32 4, i32 31), !dbg !65
788
+ %745 = bitcast i32 %744 to float, !dbg !65
789
+ %746 = bitcast float %679 to i32, !dbg !65
790
+ %747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 4, i32 31), !dbg !65
791
+ %748 = bitcast i32 %747 to float, !dbg !65
792
+ %749 = bitcast float %669 to i32, !dbg !65
793
+ %750 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %749, i32 4, i32 31), !dbg !65
794
+ %751 = bitcast i32 %750 to float, !dbg !65
795
+ %752 = fsub float %745, %674, !dbg !50
796
+ %753 = fadd float %669, %751, !dbg !54
797
+ %754 = fcmp oeq float %753, 0.000000e+00, !dbg !55
798
+ %755 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %751, float %753) #6, !dbg !56
799
+ %756 = select i1 %754, float 0.000000e+00, float %755, !dbg !57
800
+ %757 = fmul float %752, %756, !dbg !58
801
+ %758 = fadd float %674, %757, !dbg !59
802
+ %759 = fadd float %679, %748, !dbg !60
803
+ %760 = fmul float %752, %752, !dbg !61
804
+ %761 = fmul float %669, %760, !dbg !62
805
+ %762 = fmul float %761, %756, !dbg !63
806
+ %763 = fadd float %759, %762, !dbg !64
807
+ %764 = bitcast float %758 to i32, !dbg !65
808
+ %765 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %764, i32 2, i32 31), !dbg !65
809
+ %766 = bitcast i32 %765 to float, !dbg !65
810
+ %767 = bitcast float %763 to i32, !dbg !65
811
+ %768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 2, i32 31), !dbg !65
812
+ %769 = bitcast i32 %768 to float, !dbg !65
813
+ %770 = bitcast float %753 to i32, !dbg !65
814
+ %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 2, i32 31), !dbg !65
815
+ %772 = bitcast i32 %771 to float, !dbg !65
816
+ %773 = fsub float %766, %758, !dbg !50
817
+ %774 = fadd float %753, %772, !dbg !54
818
+ %775 = fcmp oeq float %774, 0.000000e+00, !dbg !55
819
+ %776 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %772, float %774) #6, !dbg !56
820
+ %777 = select i1 %775, float 0.000000e+00, float %776, !dbg !57
821
+ %778 = fmul float %773, %777, !dbg !58
822
+ %779 = fadd float %758, %778, !dbg !59
823
+ %780 = fadd float %763, %769, !dbg !60
824
+ %781 = fmul float %773, %773, !dbg !61
825
+ %782 = fmul float %753, %781, !dbg !62
826
+ %783 = fmul float %777, %782, !dbg !63
827
+ %784 = fadd float %780, %783, !dbg !64
828
+ %785 = bitcast float %779 to i32, !dbg !65
829
+ %786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 1, i32 31), !dbg !65
830
+ %787 = bitcast i32 %786 to float, !dbg !65
831
+ %788 = bitcast float %784 to i32, !dbg !65
832
+ %789 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %788, i32 1, i32 31), !dbg !65
833
+ %790 = bitcast i32 %789 to float, !dbg !65
834
+ %791 = bitcast float %774 to i32, !dbg !65
835
+ %792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %791, i32 1, i32 31), !dbg !65
836
+ %793 = bitcast i32 %792 to float, !dbg !65
837
+ %794 = fsub float %787, %779, !dbg !50
838
+ %795 = fadd float %774, %793, !dbg !54
839
+ %796 = fcmp oeq float %795, 0.000000e+00, !dbg !55
840
+ %797 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %793, float %795) #6, !dbg !56
841
+ %798 = select i1 %796, float 0.000000e+00, float %797, !dbg !57
842
+ %799 = fmul float %794, %798, !dbg !58
843
+ %800 = fadd float %779, %799, !dbg !59
844
+ %801 = fadd float %784, %790, !dbg !60
845
+ %802 = fmul float %794, %794, !dbg !61
846
+ %803 = fmul float %774, %802, !dbg !62
847
+ %804 = fmul float %798, %803, !dbg !63
848
+ %805 = fadd float %801, %804, !dbg !64
849
+ %806 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
850
+ %807 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
851
+ %808 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
852
+ %809 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
853
+ %810 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
854
+ %811 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
855
+ %812 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
856
+ %813 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
857
+ %814 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
858
+ %815 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
859
+ %816 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
860
+ %817 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
861
+ %818 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
862
+ %819 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
863
+ %820 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
864
+ %821 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
865
+ %822 = fadd float %806, 0x3EE4F8B580000000, !dbg !68
866
+ %823 = fadd float %814, 0x3EE4F8B580000000, !dbg !68
867
+ br label %824, !dbg !69
868
+
869
+ 824: ; preds = %435, %__nv_rsqrtf.exit40
870
+ %825 = phi i32 [ 0, %435 ], [ %1134, %__nv_rsqrtf.exit40 ]
871
+ %826 = or i32 %825, %14, !dbg !70
872
+ %827 = or i32 %825, %15, !dbg !70
873
+ %828 = add i32 %826, %47, !dbg !71
874
+ %829 = add i32 %827, %47, !dbg !71
875
+ %830 = add i32 %826, %48, !dbg !71
876
+ %831 = add i32 %827, %48, !dbg !71
877
+ %832 = sext i32 %828 to i64, !dbg !72
878
+ %833 = getelementptr float, ptr addrspace(1) %2, i64 %832, !dbg !72
879
+ %834 = sext i32 %829 to i64, !dbg !72
880
+ %835 = getelementptr float, ptr addrspace(1) %2, i64 %834, !dbg !72
881
+ %836 = sext i32 %830 to i64, !dbg !72
882
+ %837 = getelementptr float, ptr addrspace(1) %2, i64 %836, !dbg !72
883
+ %838 = sext i32 %831 to i64, !dbg !72
884
+ %839 = getelementptr float, ptr addrspace(1) %2, i64 %838, !dbg !72
885
+ %840 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %833, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
886
+ %841 = extractvalue { i32, i32, i32, i32 } %840, 0, !dbg !73
887
+ %842 = extractvalue { i32, i32, i32, i32 } %840, 1, !dbg !73
888
+ %843 = extractvalue { i32, i32, i32, i32 } %840, 2, !dbg !73
889
+ %844 = extractvalue { i32, i32, i32, i32 } %840, 3, !dbg !73
890
+ %845 = bitcast i32 %841 to float, !dbg !73
891
+ %846 = bitcast i32 %842 to float, !dbg !73
892
+ %847 = bitcast i32 %843 to float, !dbg !73
893
+ %848 = bitcast i32 %844 to float, !dbg !73
894
+ %849 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
895
+ %850 = extractvalue { i32, i32, i32, i32 } %849, 0, !dbg !73
896
+ %851 = extractvalue { i32, i32, i32, i32 } %849, 1, !dbg !73
897
+ %852 = extractvalue { i32, i32, i32, i32 } %849, 2, !dbg !73
898
+ %853 = extractvalue { i32, i32, i32, i32 } %849, 3, !dbg !73
899
+ %854 = bitcast i32 %850 to float, !dbg !73
900
+ %855 = bitcast i32 %851 to float, !dbg !73
901
+ %856 = bitcast i32 %852 to float, !dbg !73
902
+ %857 = bitcast i32 %853 to float, !dbg !73
903
+ %858 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
904
+ %859 = extractvalue { i32, i32, i32, i32 } %858, 0, !dbg !73
905
+ %860 = extractvalue { i32, i32, i32, i32 } %858, 1, !dbg !73
906
+ %861 = extractvalue { i32, i32, i32, i32 } %858, 2, !dbg !73
907
+ %862 = extractvalue { i32, i32, i32, i32 } %858, 3, !dbg !73
908
+ %863 = bitcast i32 %859 to float, !dbg !73
909
+ %864 = bitcast i32 %860 to float, !dbg !73
910
+ %865 = bitcast i32 %861 to float, !dbg !73
911
+ %866 = bitcast i32 %862 to float, !dbg !73
912
+ %867 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %839, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
913
+ %868 = extractvalue { i32, i32, i32, i32 } %867, 0, !dbg !73
914
+ %869 = extractvalue { i32, i32, i32, i32 } %867, 1, !dbg !73
915
+ %870 = extractvalue { i32, i32, i32, i32 } %867, 2, !dbg !73
916
+ %871 = extractvalue { i32, i32, i32, i32 } %867, 3, !dbg !73
917
+ %872 = bitcast i32 %868 to float, !dbg !73
918
+ %873 = bitcast i32 %869 to float, !dbg !73
919
+ %874 = bitcast i32 %870 to float, !dbg !73
920
+ %875 = bitcast i32 %871 to float, !dbg !73
921
+ %876 = add i32 %826, %49, !dbg !74
922
+ %877 = add i32 %826, %50, !dbg !74
923
+ %878 = sext i32 %876 to i64, !dbg !75
924
+ %879 = getelementptr i16, ptr addrspace(1) %3, i64 %878, !dbg !75
925
+ %880 = sext i32 %877 to i64, !dbg !75
926
+ %881 = getelementptr i16, ptr addrspace(1) %3, i64 %880, !dbg !75
927
+ %882 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %879, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
928
+ %883 = extractvalue { i32, i32, i32, i32 } %882, 0, !dbg !76
929
+ %884 = extractvalue { i32, i32, i32, i32 } %882, 1, !dbg !76
930
+ %885 = extractvalue { i32, i32, i32, i32 } %882, 2, !dbg !76
931
+ %886 = extractvalue { i32, i32, i32, i32 } %882, 3, !dbg !76
932
+ %887 = trunc i32 %883 to i16, !dbg !76
933
+ %extelt.offset = lshr i32 %883, 16, !dbg !76
934
+ %888 = trunc i32 %extelt.offset to i16, !dbg !76
935
+ %889 = trunc i32 %884 to i16, !dbg !76
936
+ %extelt.offset2 = lshr i32 %884, 16, !dbg !76
937
+ %890 = trunc i32 %extelt.offset2 to i16, !dbg !76
938
+ %891 = trunc i32 %885 to i16, !dbg !76
939
+ %extelt.offset3 = lshr i32 %885, 16, !dbg !76
940
+ %892 = trunc i32 %extelt.offset3 to i16, !dbg !76
941
+ %893 = trunc i32 %886 to i16, !dbg !76
942
+ %extelt.offset4 = lshr i32 %886, 16, !dbg !76
943
+ %894 = trunc i32 %extelt.offset4 to i16, !dbg !76
944
+ %895 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %881, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
945
+ %896 = extractvalue { i32, i32, i32, i32 } %895, 0, !dbg !76
946
+ %897 = extractvalue { i32, i32, i32, i32 } %895, 1, !dbg !76
947
+ %898 = extractvalue { i32, i32, i32, i32 } %895, 2, !dbg !76
948
+ %899 = extractvalue { i32, i32, i32, i32 } %895, 3, !dbg !76
949
+ %900 = trunc i32 %896 to i16, !dbg !76
950
+ %extelt.offset5 = lshr i32 %896, 16, !dbg !76
951
+ %901 = trunc i32 %extelt.offset5 to i16, !dbg !76
952
+ %902 = trunc i32 %897 to i16, !dbg !76
953
+ %extelt.offset6 = lshr i32 %897, 16, !dbg !76
954
+ %903 = trunc i32 %extelt.offset6 to i16, !dbg !76
955
+ %904 = trunc i32 %898 to i16, !dbg !76
956
+ %extelt.offset7 = lshr i32 %898, 16, !dbg !76
957
+ %905 = trunc i32 %extelt.offset7 to i16, !dbg !76
958
+ %906 = trunc i32 %899 to i16, !dbg !76
959
+ %extelt.offset8 = lshr i32 %899, 16, !dbg !76
960
+ %907 = trunc i32 %extelt.offset8 to i16, !dbg !76
961
+ %908 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %887) #6, !dbg !77
962
+ %909 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %888) #6, !dbg !77
963
+ %910 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %889) #6, !dbg !77
964
+ %911 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %890) #6, !dbg !77
965
+ %912 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %891) #6, !dbg !77
966
+ %913 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %892) #6, !dbg !77
967
+ %914 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %893) #6, !dbg !77
968
+ %915 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %894) #6, !dbg !77
969
+ %916 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %900) #6, !dbg !77
970
+ %917 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %901) #6, !dbg !77
971
+ %918 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %902) #6, !dbg !77
972
+ %919 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %903) #6, !dbg !77
973
+ %920 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %904) #6, !dbg !77
974
+ %921 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %905) #6, !dbg !77
975
+ %922 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %906) #6, !dbg !77
976
+ %923 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %907) #6, !dbg !77
977
+ %924 = zext nneg i32 %826 to i64, !dbg !78
978
+ %925 = getelementptr float, ptr addrspace(1) %4, i64 %924, !dbg !78
979
+ %926 = zext nneg i32 %827 to i64, !dbg !78
980
+ %927 = getelementptr float, ptr addrspace(1) %4, i64 %926, !dbg !78
981
+ %928 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %925, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
982
+ %929 = extractvalue { i32, i32, i32, i32 } %928, 0, !dbg !79
983
+ %930 = extractvalue { i32, i32, i32, i32 } %928, 1, !dbg !79
984
+ %931 = extractvalue { i32, i32, i32, i32 } %928, 2, !dbg !79
985
+ %932 = extractvalue { i32, i32, i32, i32 } %928, 3, !dbg !79
986
+ %933 = bitcast i32 %929 to float, !dbg !79
987
+ %934 = bitcast i32 %930 to float, !dbg !79
988
+ %935 = bitcast i32 %931 to float, !dbg !79
989
+ %936 = bitcast i32 %932 to float, !dbg !79
990
+ %937 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %927, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
991
+ %938 = extractvalue { i32, i32, i32, i32 } %937, 0, !dbg !79
992
+ %939 = extractvalue { i32, i32, i32, i32 } %937, 1, !dbg !79
993
+ %940 = extractvalue { i32, i32, i32, i32 } %937, 2, !dbg !79
994
+ %941 = extractvalue { i32, i32, i32, i32 } %937, 3, !dbg !79
995
+ %942 = bitcast i32 %938 to float, !dbg !79
996
+ %943 = bitcast i32 %939 to float, !dbg !79
997
+ %944 = bitcast i32 %940 to float, !dbg !79
998
+ %945 = bitcast i32 %941 to float, !dbg !79
999
+ br i1 %56, label %946, label %947, !dbg !80
1000
+
1001
+ 946: ; preds = %824
1002
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
1003
+ br label %947, !dbg !80
1004
+
1005
+ 947: ; preds = %946, %824
1006
+ %948 = getelementptr float, ptr addrspace(1) %63, i64 %924, !dbg !81
1007
+ %949 = getelementptr float, ptr addrspace(1) %63, i64 %926, !dbg !81
1008
+ %950 = getelementptr float, ptr addrspace(1) %64, i64 %924, !dbg !81
1009
+ %951 = getelementptr float, ptr addrspace(1) %64, i64 %926, !dbg !81
1010
+ %952 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %948, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1011
+ %953 = extractvalue { i32, i32, i32, i32 } %952, 0, !dbg !82
1012
+ %954 = extractvalue { i32, i32, i32, i32 } %952, 1, !dbg !82
1013
+ %955 = extractvalue { i32, i32, i32, i32 } %952, 2, !dbg !82
1014
+ %956 = extractvalue { i32, i32, i32, i32 } %952, 3, !dbg !82
1015
+ %957 = bitcast i32 %953 to float, !dbg !82
1016
+ %958 = bitcast i32 %954 to float, !dbg !82
1017
+ %959 = bitcast i32 %955 to float, !dbg !82
1018
+ %960 = bitcast i32 %956 to float, !dbg !82
1019
+ %961 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %949, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1020
+ %962 = extractvalue { i32, i32, i32, i32 } %961, 0, !dbg !82
1021
+ %963 = extractvalue { i32, i32, i32, i32 } %961, 1, !dbg !82
1022
+ %964 = extractvalue { i32, i32, i32, i32 } %961, 2, !dbg !82
1023
+ %965 = extractvalue { i32, i32, i32, i32 } %961, 3, !dbg !82
1024
+ %966 = bitcast i32 %962 to float, !dbg !82
1025
+ %967 = bitcast i32 %963 to float, !dbg !82
1026
+ %968 = bitcast i32 %964 to float, !dbg !82
1027
+ %969 = bitcast i32 %965 to float, !dbg !82
1028
+ %970 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %950, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1029
+ %971 = extractvalue { i32, i32, i32, i32 } %970, 0, !dbg !82
1030
+ %972 = extractvalue { i32, i32, i32, i32 } %970, 1, !dbg !82
1031
+ %973 = extractvalue { i32, i32, i32, i32 } %970, 2, !dbg !82
1032
+ %974 = extractvalue { i32, i32, i32, i32 } %970, 3, !dbg !82
1033
+ %975 = bitcast i32 %971 to float, !dbg !82
1034
+ %976 = bitcast i32 %972 to float, !dbg !82
1035
+ %977 = bitcast i32 %973 to float, !dbg !82
1036
+ %978 = bitcast i32 %974 to float, !dbg !82
1037
+ %979 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %951, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1038
+ %980 = extractvalue { i32, i32, i32, i32 } %979, 0, !dbg !82
1039
+ %981 = extractvalue { i32, i32, i32, i32 } %979, 1, !dbg !82
1040
+ %982 = extractvalue { i32, i32, i32, i32 } %979, 2, !dbg !82
1041
+ %983 = extractvalue { i32, i32, i32, i32 } %979, 3, !dbg !82
1042
+ %984 = bitcast i32 %980 to float, !dbg !82
1043
+ %985 = bitcast i32 %981 to float, !dbg !82
1044
+ %986 = bitcast i32 %982 to float, !dbg !82
1045
+ %987 = bitcast i32 %983 to float, !dbg !82
1046
+ %988 = fadd float %845, %957, !dbg !83
1047
+ %989 = fadd float %846, %958, !dbg !83
1048
+ %990 = fadd float %847, %959, !dbg !83
1049
+ %991 = fadd float %848, %960, !dbg !83
1050
+ %992 = fadd float %854, %966, !dbg !83
1051
+ %993 = fadd float %855, %967, !dbg !83
1052
+ %994 = fadd float %856, %968, !dbg !83
1053
+ %995 = fadd float %857, %969, !dbg !83
1054
+ %996 = fadd float %863, %975, !dbg !83
1055
+ %997 = fadd float %864, %976, !dbg !83
1056
+ %998 = fadd float %865, %977, !dbg !83
1057
+ %999 = fadd float %866, %978, !dbg !83
1058
+ %1000 = fadd float %872, %984, !dbg !83
1059
+ %1001 = fadd float %873, %985, !dbg !83
1060
+ %1002 = fadd float %874, %986, !dbg !83
1061
+ %1003 = fadd float %875, %987, !dbg !83
1062
+ %1004 = fadd float %908, %988, !dbg !84
1063
+ %1005 = fadd float %909, %989, !dbg !84
1064
+ %1006 = fadd float %910, %990, !dbg !84
1065
+ %1007 = fadd float %911, %991, !dbg !84
1066
+ %1008 = fadd float %912, %992, !dbg !84
1067
+ %1009 = fadd float %913, %993, !dbg !84
1068
+ %1010 = fadd float %914, %994, !dbg !84
1069
+ %1011 = fadd float %915, %995, !dbg !84
1070
+ %1012 = fadd float %916, %996, !dbg !84
1071
+ %1013 = fadd float %917, %997, !dbg !84
1072
+ %1014 = fadd float %918, %998, !dbg !84
1073
+ %1015 = fadd float %919, %999, !dbg !84
1074
+ %1016 = fadd float %920, %1000, !dbg !84
1075
+ %1017 = fadd float %921, %1001, !dbg !84
1076
+ %1018 = fadd float %922, %1002, !dbg !84
1077
+ %1019 = fadd float %923, %1003, !dbg !84
1078
+ %1020 = fsub float %1004, %737, !dbg !85
1079
+ %1021 = fsub float %1005, %737, !dbg !85
1080
+ %1022 = fsub float %1006, %737, !dbg !85
1081
+ %1023 = fsub float %1007, %737, !dbg !85
1082
+ %1024 = fsub float %1008, %737, !dbg !85
1083
+ %1025 = fsub float %1009, %737, !dbg !85
1084
+ %1026 = fsub float %1010, %737, !dbg !85
1085
+ %1027 = fsub float %1011, %737, !dbg !85
1086
+ %1028 = fsub float %1012, %800, !dbg !85
1087
+ %1029 = fsub float %1013, %800, !dbg !85
1088
+ %1030 = fsub float %1014, %800, !dbg !85
1089
+ %1031 = fsub float %1015, %800, !dbg !85
1090
+ %1032 = fsub float %1016, %800, !dbg !85
1091
+ %1033 = fsub float %1017, %800, !dbg !85
1092
+ %1034 = fsub float %1018, %800, !dbg !85
1093
+ %1035 = fsub float %1019, %800, !dbg !85
1094
+ %1036 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1095
+ %.not.i = icmp eq i32 %1036, 0, !dbg !86
1096
+ br i1 %.not.i, label %1039, label %1037, !dbg !86
1097
+
1098
+ 1037: ; preds = %947
1099
+ %1038 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %822), !dbg !86
1100
+ br label %__nv_rsqrtf.exit, !dbg !86
1101
+
1102
+ 1039: ; preds = %947
1103
+ %1040 = tail call float @llvm.nvvm.rsqrt.approx.f(float %822), !dbg !86
1104
+ br label %__nv_rsqrtf.exit, !dbg !86
1105
+
1106
+ __nv_rsqrtf.exit: ; preds = %1037, %1039
1107
+ %.0.i = phi float [ %1038, %1037 ], [ %1040, %1039 ], !dbg !86
1108
+ %1041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1109
+ %1042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1110
+ %1043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1111
+ %1044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1112
+ %1045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1113
+ %1046 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1114
+ %1047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1115
+ %1048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1116
+ %.not.i38 = icmp eq i32 %1048, 0, !dbg !86
1117
+ br i1 %.not.i38, label %1051, label %1049, !dbg !86
1118
+
1119
+ 1049: ; preds = %__nv_rsqrtf.exit
1120
+ %1050 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %823), !dbg !86
1121
+ br label %__nv_rsqrtf.exit40, !dbg !86
1122
+
1123
+ 1051: ; preds = %__nv_rsqrtf.exit
1124
+ %1052 = tail call float @llvm.nvvm.rsqrt.approx.f(float %823), !dbg !86
1125
+ br label %__nv_rsqrtf.exit40, !dbg !86
1126
+
1127
+ __nv_rsqrtf.exit40: ; preds = %1049, %1051
1128
+ %.0.i39 = phi float [ %1050, %1049 ], [ %1052, %1051 ], !dbg !86
1129
+ %1053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1130
+ %1054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1131
+ %1055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1132
+ %1056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1133
+ %1057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1134
+ %1058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1135
+ %1059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1136
+ %1060 = fmul float %1020, %.0.i, !dbg !87
1137
+ %1061 = fmul float %1021, %.0.i, !dbg !87
1138
+ %1062 = fmul float %1022, %.0.i, !dbg !87
1139
+ %1063 = fmul float %1023, %.0.i, !dbg !87
1140
+ %1064 = fmul float %1024, %.0.i, !dbg !87
1141
+ %1065 = fmul float %1025, %.0.i, !dbg !87
1142
+ %1066 = fmul float %1026, %.0.i, !dbg !87
1143
+ %1067 = fmul float %1027, %.0.i, !dbg !87
1144
+ %1068 = fmul float %1028, %.0.i39, !dbg !87
1145
+ %1069 = fmul float %1029, %.0.i39, !dbg !87
1146
+ %1070 = fmul float %1030, %.0.i39, !dbg !87
1147
+ %1071 = fmul float %1031, %.0.i39, !dbg !87
1148
+ %1072 = fmul float %1032, %.0.i39, !dbg !87
1149
+ %1073 = fmul float %1033, %.0.i39, !dbg !87
1150
+ %1074 = fmul float %1034, %.0.i39, !dbg !87
1151
+ %1075 = fmul float %1035, %.0.i39, !dbg !87
1152
+ %1076 = fmul float %1060, %933, !dbg !88
1153
+ %1077 = fmul float %1061, %934, !dbg !88
1154
+ %1078 = fmul float %1062, %935, !dbg !88
1155
+ %1079 = fmul float %1063, %936, !dbg !88
1156
+ %1080 = fmul float %1064, %942, !dbg !88
1157
+ %1081 = fmul float %1065, %943, !dbg !88
1158
+ %1082 = fmul float %1066, %944, !dbg !88
1159
+ %1083 = fmul float %1067, %945, !dbg !88
1160
+ %1084 = fmul float %1068, %933, !dbg !88
1161
+ %1085 = fmul float %1069, %934, !dbg !88
1162
+ %1086 = fmul float %1070, %935, !dbg !88
1163
+ %1087 = fmul float %1071, %936, !dbg !88
1164
+ %1088 = fmul float %1072, %942, !dbg !88
1165
+ %1089 = fmul float %1073, %943, !dbg !88
1166
+ %1090 = fmul float %1074, %944, !dbg !88
1167
+ %1091 = fmul float %1075, %945, !dbg !88
1168
+ %1092 = getelementptr i16, ptr addrspace(1) %5, i64 %878, !dbg !89
1169
+ %1093 = getelementptr i16, ptr addrspace(1) %5, i64 %880, !dbg !89
1170
+ %1094 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1076) #6, !dbg !90
1171
+ %1095 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1077) #6, !dbg !90
1172
+ %1096 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1078) #6, !dbg !90
1173
+ %1097 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1079) #6, !dbg !90
1174
+ %1098 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1080) #6, !dbg !90
1175
+ %1099 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1081) #6, !dbg !90
1176
+ %1100 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1082) #6, !dbg !90
1177
+ %1101 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1083) #6, !dbg !90
1178
+ %1102 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1084) #6, !dbg !90
1179
+ %1103 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1085) #6, !dbg !90
1180
+ %1104 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1086) #6, !dbg !90
1181
+ %1105 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1087) #6, !dbg !90
1182
+ %1106 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1088) #6, !dbg !90
1183
+ %1107 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1089) #6, !dbg !90
1184
+ %1108 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1090) #6, !dbg !90
1185
+ %1109 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1091) #6, !dbg !90
1186
+ %1110 = insertelement <2 x i16> undef, i16 %1094, i64 0, !dbg !90
1187
+ %1111 = insertelement <2 x i16> %1110, i16 %1095, i64 1, !dbg !90
1188
+ %1112 = bitcast <2 x i16> %1111 to i32, !dbg !90
1189
+ %1113 = insertelement <2 x i16> undef, i16 %1096, i64 0, !dbg !90
1190
+ %1114 = insertelement <2 x i16> %1113, i16 %1097, i64 1, !dbg !90
1191
+ %1115 = bitcast <2 x i16> %1114 to i32, !dbg !90
1192
+ %1116 = insertelement <2 x i16> undef, i16 %1098, i64 0, !dbg !90
1193
+ %1117 = insertelement <2 x i16> %1116, i16 %1099, i64 1, !dbg !90
1194
+ %1118 = bitcast <2 x i16> %1117 to i32, !dbg !90
1195
+ %1119 = insertelement <2 x i16> undef, i16 %1100, i64 0, !dbg !90
1196
+ %1120 = insertelement <2 x i16> %1119, i16 %1101, i64 1, !dbg !90
1197
+ %1121 = bitcast <2 x i16> %1120 to i32, !dbg !90
1198
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1112, i32 %1115, i32 %1118, i32 %1121, ptr addrspace(1) %1092, i1 true) #6, !dbg !90
1199
+ %1122 = insertelement <2 x i16> undef, i16 %1102, i64 0, !dbg !90
1200
+ %1123 = insertelement <2 x i16> %1122, i16 %1103, i64 1, !dbg !90
1201
+ %1124 = bitcast <2 x i16> %1123 to i32, !dbg !90
1202
+ %1125 = insertelement <2 x i16> undef, i16 %1104, i64 0, !dbg !90
1203
+ %1126 = insertelement <2 x i16> %1125, i16 %1105, i64 1, !dbg !90
1204
+ %1127 = bitcast <2 x i16> %1126 to i32, !dbg !90
1205
+ %1128 = insertelement <2 x i16> undef, i16 %1106, i64 0, !dbg !90
1206
+ %1129 = insertelement <2 x i16> %1128, i16 %1107, i64 1, !dbg !90
1207
+ %1130 = bitcast <2 x i16> %1129 to i32, !dbg !90
1208
+ %1131 = insertelement <2 x i16> undef, i16 %1108, i64 0, !dbg !90
1209
+ %1132 = insertelement <2 x i16> %1131, i16 %1109, i64 1, !dbg !90
1210
+ %1133 = bitcast <2 x i16> %1132 to i32, !dbg !90
1211
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1124, i32 %1127, i32 %1130, i32 %1133, ptr addrspace(1) %1093, i1 true) #6, !dbg !90
1212
+ %1134 = add nuw nsw i32 %825, 64, !dbg !69
1213
+ %1135 = icmp ult i32 %825, 192, !dbg !69
1214
+ br i1 %1135, label %824, label %1136, !dbg !69
1215
+
1216
+ 1136: ; preds = %__nv_rsqrtf.exit40
1217
+ ret void, !dbg !91
1218
+ }
1219
+
1220
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1221
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
1222
+
1223
+ ; Function Attrs: convergent nocallback nounwind
1224
+ declare void @llvm.nvvm.barrier0() #1
1225
+
1226
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1227
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
1228
+
1229
+ ; Function Attrs: alwaysinline nounwind
1230
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
1231
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
1232
+ %.not = icmp eq i32 %1, 0
1233
+ br i1 %.not, label %4, label %2
1234
+
1235
+ 2: ; preds = %0
1236
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
1237
+ br label %6
1238
+
1239
+ 4: ; preds = %0
1240
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
1241
+ br label %6
1242
+
1243
+ 6: ; preds = %4, %2
1244
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
1245
+ ret float %.0
1246
+ }
1247
+
1248
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1249
+
1250
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1251
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
1252
+
1253
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1254
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
1255
+
1256
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1257
+ attributes #1 = { convergent nocallback nounwind }
1258
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1259
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1260
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1261
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1262
+ attributes #6 = { nounwind }
1263
+
1264
+ !llvm.module.flags = !{!0, !1}
1265
+ !llvm.dbg.cu = !{!2}
1266
+ !nvvm.annotations = !{!4, !5, !5, !4}
1267
+ !llvm.ident = !{!6}
1268
+
1269
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1270
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1271
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1272
+ !3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci")
1273
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
1274
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
1275
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1276
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
1277
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
1278
+ !9 = !{}
1279
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
1280
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
1281
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
1282
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
1283
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
1284
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
1285
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
1286
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
1287
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
1288
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
1289
+ !20 = !DILocation(line: 36, column: 44, scope: !7)
1290
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
1291
+ !22 = !DILocation(line: 38, column: 22, scope: !7)
1292
+ !23 = !DILocation(line: 39, column: 36, scope: !7)
1293
+ !24 = !DILocation(line: 40, column: 40, scope: !7)
1294
+ !25 = !DILocation(line: 41, column: 44, scope: !7)
1295
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
1296
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
1297
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
1298
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
1299
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
1300
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
1301
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
1302
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
1303
+ !34 = !DILocation(line: 40, column: 55, scope: !7)
1304
+ !35 = !DILocation(line: 41, column: 40, scope: !7)
1305
+ !36 = !DILocation(line: 41, column: 34, scope: !7)
1306
+ !37 = !DILocation(line: 41, column: 52, scope: !7)
1307
+ !38 = !DILocation(line: 42, column: 22, scope: !7)
1308
+ !39 = !DILocation(line: 44, column: 22, scope: !7)
1309
+ !40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
1310
+ !41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
1311
+ !42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
1312
+ !43 = !DILocation(line: 47, column: 41, scope: !41)
1313
+ !44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
1314
+ !45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
1315
+ !46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
1316
+ !47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
1317
+ !48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
1318
+ !49 = !DILocation(line: 50, column: 50, scope: !7)
1319
+ !50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
1320
+ !51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
1321
+ !52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
1322
+ !53 = !DILocation(line: 53, column: 44, scope: !51)
1323
+ !54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
1324
+ !55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
1325
+ !56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
1326
+ !57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
1327
+ !58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
1328
+ !59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
1329
+ !60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
1330
+ !61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
1331
+ !62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
1332
+ !63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
1333
+ !64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
1334
+ !65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
1335
+ !66 = !DILocation(line: 53, column: 44, scope: !41)
1336
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
1337
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
1338
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
1339
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
1340
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
1341
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
1342
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
1343
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
1344
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
1345
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
1346
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
1347
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
1348
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
1349
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
1350
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
1351
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
1352
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
1353
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
1354
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
1355
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
1356
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
1357
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
1358
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
1359
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
1360
+ !91 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ptx ADDED
@@ -0,0 +1,2004 @@
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 256, 1, 1
39
+ {
40
+ .reg .pred %p<157>;
41
+ .reg .b16 %rs<49>;
42
+ .reg .b32 %r<474>;
43
+ .reg .f32 %f<678>;
44
+ .reg .b64 %rd<118>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5];
50
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4];
51
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3];
52
+ ld.param.u64 %rd52, [triton__0d1d2d3d4d5d6de7de_param_0];
53
+ ld.param.u64 %rd53, [triton__0d1d2d3d4d5d6de7de_param_1];
54
+ $L__tmp0:
55
+ .loc 1 22 44
56
+ mov.u32 %r12, %tid.x;
57
+ ld.param.u64 %rd54, [triton__0d1d2d3d4d5d6de7de_param_2];
58
+ bfe.u32 %r1, %r12, 3, 5;
59
+ and.b32 %r2, %r12, 63;
60
+ .loc 1 24 33
61
+ shl.b32 %r13, %r12, 3;
62
+ and.b32 %r3, %r13, 56;
63
+ .loc 1 31 36
64
+ shr.u32 %r4, %r12, 6;
65
+ .loc 1 21 28
66
+ mov.u32 %r10, %ctaid.x;
67
+ .loc 1 21 33
68
+ shl.b32 %r14, %r10, 6;
69
+ .loc 1 22 23
70
+ or.b32 %r15, %r14, %r1;
71
+ or.b32 %r16, %r15, 32;
72
+ or.b32 %r17, %r14, %r2;
73
+ .loc 1 26 30
74
+ mul.wide.s32 %rd55, %r15, 8;
75
+ add.s64 %rd18, %rd52, %rd55;
76
+ add.s64 %rd34, %rd18, 256;
77
+ mul.wide.s32 %rd56, %r17, 8;
78
+ add.s64 %rd50, %rd52, %rd56;
79
+ mov.pred %p1, -1;
80
+ .loc 1 26 35
81
+ mov.u64 %rd17, 0x0;
82
+ @%p1 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd18 + 0 ];
83
+ mov.u64 %rd19, 0x0;
84
+ @%p1 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd18 + 0 ];
85
+ mov.u64 %rd21, 0x0;
86
+ @%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd18 + 0 ];
87
+ mov.u64 %rd23, 0x0;
88
+ @%p1 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd18 + 0 ];
89
+ mov.u64 %rd25, 0x0;
90
+ @%p1 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd18 + 0 ];
91
+ mov.u64 %rd27, 0x0;
92
+ @%p1 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd18 + 0 ];
93
+ mov.u64 %rd29, 0x0;
94
+ @%p1 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd18 + 0 ];
95
+ mov.u64 %rd31, 0x0;
96
+ @%p1 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd18 + 0 ];
97
+ mov.u64 %rd33, 0x0;
98
+ @%p1 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd34 + 0 ];
99
+ mov.u64 %rd35, 0x0;
100
+ @%p1 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd34 + 0 ];
101
+ mov.u64 %rd37, 0x0;
102
+ @%p1 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd34 + 0 ];
103
+ mov.u64 %rd39, 0x0;
104
+ @%p1 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd34 + 0 ];
105
+ mov.u64 %rd41, 0x0;
106
+ @%p1 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd34 + 0 ];
107
+ mov.u64 %rd43, 0x0;
108
+ @%p1 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd34 + 0 ];
109
+ mov.u64 %rd45, 0x0;
110
+ @%p1 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd34 + 0 ];
111
+ mov.u64 %rd47, 0x0;
112
+ @%p1 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd34 + 0 ];
113
+ mov.u64 %rd49, 0x0;
114
+ @%p1 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd50 + 0 ];
115
+ .loc 1 27 18
116
+ bfe.s32 %r18, %r10, 25, 1;
117
+ shr.u32 %r19, %r18, 23;
118
+ add.s32 %r20, %r15, %r19;
119
+ and.b32 %r21, %r20, 16776704;
120
+ sub.s32 %r22, %r15, %r21;
121
+ add.s32 %r23, %r16, %r19;
122
+ and.b32 %r24, %r23, 16776704;
123
+ sub.s32 %r25, %r16, %r24;
124
+ .loc 1 35 44
125
+ shl.b32 %r26, %r22, 8;
126
+ shl.b32 %r27, %r25, 8;
127
+ .loc 1 37 22
128
+ add.s64 %rd57, %rd49, 50257;
129
+ .loc 1 38 22
130
+ setp.lt.s64 %p18, %rd17, 0;
131
+ setp.lt.s64 %p19, %rd33, 0;
132
+ setp.lt.s64 %p20, %rd49, 0;
133
+ .loc 1 39 36
134
+ selp.b64 %rd1, %rd57, %rd49, %p20;
135
+ .loc 1 41 44
136
+ shl.b64 %rd58, %rd17, 8;
137
+ add.s64 %rd59, %rd58, 12865792;
138
+ selp.b64 %rd60, %rd59, %rd58, %p18;
139
+ shl.b64 %rd61, %rd33, 8;
140
+ add.s64 %rd62, %rd61, 12865792;
141
+ selp.b64 %rd63, %rd62, %rd61, %p19;
142
+ .loc 1 31 36
143
+ and.b32 %r28, %r12, 7;
144
+ mul.wide.u32 %rd2, %r28, 32;
145
+ shl.b64 %rd64, %rd63, 2;
146
+ or.b64 %rd65, %rd2, %rd64;
147
+ add.s64 %rd3, %rd53, %rd65;
148
+ shl.b64 %rd66, %rd60, 2;
149
+ or.b64 %rd67, %rd2, %rd66;
150
+ add.s64 %rd4, %rd53, %rd67;
151
+ or.b32 %r29, %r27, %r3;
152
+ mul.wide.s32 %rd68, %r29, 4;
153
+ add.s64 %rd5, %rd54, %rd68;
154
+ or.b32 %r30, %r26, %r3;
155
+ mul.wide.s32 %rd69, %r30, 4;
156
+ add.s64 %rd6, %rd54, %rd69;
157
+ shl.b32 %r31, %r10, 14;
158
+ shl.b32 %r32, %r1, 8;
159
+ or.b32 %r33, %r31, %r32;
160
+ or.b32 %r5, %r33, %r3;
161
+ mov.f32 %f614, 0f00000000;
162
+ mov.u64 %rd116, 0;
163
+ mov.b32 %r472, -64;
164
+ mov.f32 %f615, %f614;
165
+ mov.f32 %f616, %f614;
166
+ mov.f32 %f617, %f614;
167
+ mov.f32 %f618, %f614;
168
+ mov.f32 %f619, %f614;
169
+ mov.f32 %f620, %f614;
170
+ mov.f32 %f621, %f614;
171
+ mov.f32 %f622, %f614;
172
+ mov.f32 %f623, %f614;
173
+ mov.f32 %f624, %f614;
174
+ mov.f32 %f625, %f614;
175
+ mov.f32 %f626, %f614;
176
+ mov.f32 %f627, %f614;
177
+ mov.f32 %f628, %f614;
178
+ mov.f32 %f629, %f614;
179
+ mov.f32 %f630, %f614;
180
+ mov.f32 %f631, %f614;
181
+ mov.f32 %f632, %f614;
182
+ mov.f32 %f633, %f614;
183
+ mov.f32 %f634, %f614;
184
+ mov.f32 %f635, %f614;
185
+ mov.f32 %f636, %f614;
186
+ mov.f32 %f637, %f614;
187
+ mov.f32 %f638, %f614;
188
+ mov.f32 %f639, %f614;
189
+ mov.f32 %f640, %f614;
190
+ mov.f32 %f641, %f614;
191
+ mov.f32 %f642, %f614;
192
+ mov.f32 %f643, %f614;
193
+ mov.f32 %f644, %f614;
194
+ mov.f32 %f645, %f614;
195
+ mov.f32 %f646, %f614;
196
+ mov.f32 %f647, %f614;
197
+ mov.f32 %f648, %f614;
198
+ mov.f32 %f649, %f614;
199
+ mov.f32 %f650, %f614;
200
+ mov.f32 %f651, %f614;
201
+ mov.f32 %f652, %f614;
202
+ mov.f32 %f653, %f614;
203
+ mov.f32 %f654, %f614;
204
+ mov.f32 %f655, %f614;
205
+ mov.f32 %f656, %f614;
206
+ mov.f32 %f657, %f614;
207
+ mov.f32 %f658, %f614;
208
+ mov.f32 %f659, %f614;
209
+ mov.f32 %f660, %f614;
210
+ mov.f32 %f661, %f614;
211
+ mov.f32 %f662, %f614;
212
+ mov.f32 %f663, %f614;
213
+ mov.f32 %f664, %f614;
214
+ mov.f32 %f665, %f614;
215
+ mov.f32 %f666, %f614;
216
+ mov.f32 %f667, %f614;
217
+ mov.f32 %f668, %f614;
218
+ mov.f32 %f669, %f614;
219
+ mov.f32 %f670, %f614;
220
+ mov.f32 %f671, %f614;
221
+ mov.f32 %f672, %f614;
222
+ mov.f32 %f673, %f614;
223
+ mov.f32 %f674, %f614;
224
+ mov.f32 %f675, %f614;
225
+ mov.f32 %f676, %f614;
226
+ mov.f32 %f677, %f614;
227
+ bra.uni $L__BB0_1;
228
+ $L__BB0_3:
229
+ .loc 1 41 40
230
+ add.s64 %rd85, %rd4, %rd116;
231
+ .loc 1 41 34
232
+ add.s64 %rd86, %rd85, 16;
233
+ add.s64 %rd87, %rd3, %rd116;
234
+ .loc 1 41 52
235
+ add.s64 %rd88, %rd87, 16;
236
+ mov.u32 %r102, 0x0;
237
+ mov.u32 %r103, 0x0;
238
+ mov.u32 %r104, 0x0;
239
+ mov.u32 %r105, 0x0;
240
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd85 + 0 ];
241
+ @!%p1 mov.u32 %r102, %r411;
242
+ @!%p1 mov.u32 %r103, %r411;
243
+ @!%p1 mov.u32 %r104, %r411;
244
+ @!%p1 mov.u32 %r105, %r411;
245
+ mov.b32 %f206, %r102;
246
+ mov.b32 %f207, %r103;
247
+ mov.b32 %f208, %r104;
248
+ mov.b32 %f209, %r105;
249
+ mov.u32 %r110, 0x0;
250
+ mov.u32 %r111, 0x0;
251
+ mov.u32 %r112, 0x0;
252
+ mov.u32 %r113, 0x0;
253
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd86 + 0 ];
254
+ @!%p1 mov.u32 %r110, %r411;
255
+ @!%p1 mov.u32 %r111, %r411;
256
+ @!%p1 mov.u32 %r112, %r411;
257
+ @!%p1 mov.u32 %r113, %r411;
258
+ mov.b32 %f210, %r110;
259
+ mov.b32 %f211, %r111;
260
+ mov.b32 %f212, %r112;
261
+ mov.b32 %f213, %r113;
262
+ mov.u32 %r118, 0x0;
263
+ mov.u32 %r119, 0x0;
264
+ mov.u32 %r120, 0x0;
265
+ mov.u32 %r121, 0x0;
266
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r118, %r119, %r120, %r121 }, [ %rd87 + 0 ];
267
+ @!%p1 mov.u32 %r118, %r411;
268
+ @!%p1 mov.u32 %r119, %r411;
269
+ @!%p1 mov.u32 %r120, %r411;
270
+ @!%p1 mov.u32 %r121, %r411;
271
+ mov.b32 %f214, %r118;
272
+ mov.b32 %f215, %r119;
273
+ mov.b32 %f216, %r120;
274
+ mov.b32 %f217, %r121;
275
+ mov.u32 %r126, 0x0;
276
+ mov.u32 %r127, 0x0;
277
+ mov.u32 %r128, 0x0;
278
+ mov.u32 %r129, 0x0;
279
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r126, %r127, %r128, %r129 }, [ %rd88 + 0 ];
280
+ @!%p1 mov.u32 %r126, %r411;
281
+ @!%p1 mov.u32 %r127, %r411;
282
+ @!%p1 mov.u32 %r128, %r411;
283
+ @!%p1 mov.u32 %r129, %r411;
284
+ mov.b32 %f218, %r126;
285
+ mov.b32 %f219, %r127;
286
+ mov.b32 %f220, %r128;
287
+ mov.b32 %f221, %r129;
288
+ .loc 1 42 22
289
+ add.f32 %f222, %f65, %f206;
290
+ add.f32 %f223, %f66, %f207;
291
+ add.f32 %f224, %f67, %f208;
292
+ add.f32 %f225, %f68, %f209;
293
+ add.f32 %f226, %f69, %f210;
294
+ add.f32 %f227, %f70, %f211;
295
+ add.f32 %f228, %f71, %f212;
296
+ add.f32 %f229, %f72, %f213;
297
+ add.f32 %f230, %f73, %f214;
298
+ add.f32 %f231, %f74, %f215;
299
+ add.f32 %f232, %f75, %f216;
300
+ add.f32 %f233, %f76, %f217;
301
+ add.f32 %f234, %f77, %f218;
302
+ add.f32 %f235, %f78, %f219;
303
+ add.f32 %f236, %f79, %f220;
304
+ add.f32 %f237, %f80, %f221;
305
+ .loc 1 44 22
306
+ add.f32 %f238, %f81, %f222;
307
+ add.f32 %f239, %f82, %f223;
308
+ add.f32 %f240, %f83, %f224;
309
+ add.f32 %f241, %f84, %f225;
310
+ add.f32 %f242, %f85, %f226;
311
+ add.f32 %f243, %f86, %f227;
312
+ add.f32 %f244, %f87, %f228;
313
+ add.f32 %f245, %f88, %f229;
314
+ add.f32 %f246, %f89, %f230;
315
+ add.f32 %f247, %f90, %f231;
316
+ add.f32 %f248, %f91, %f232;
317
+ add.f32 %f249, %f92, %f233;
318
+ add.f32 %f250, %f93, %f234;
319
+ add.f32 %f251, %f94, %f235;
320
+ add.f32 %f252, %f95, %f236;
321
+ add.f32 %f253, %f96, %f237;
322
+ $L__tmp1:
323
+ .loc 2 96 20
324
+ sub.f32 %f254, %f238, %f662;
325
+ sub.f32 %f255, %f239, %f663;
326
+ sub.f32 %f256, %f240, %f664;
327
+ sub.f32 %f257, %f241, %f665;
328
+ sub.f32 %f258, %f242, %f666;
329
+ sub.f32 %f259, %f243, %f667;
330
+ sub.f32 %f260, %f244, %f668;
331
+ sub.f32 %f261, %f245, %f669;
332
+ sub.f32 %f262, %f246, %f670;
333
+ sub.f32 %f263, %f247, %f671;
334
+ sub.f32 %f264, %f248, %f672;
335
+ sub.f32 %f265, %f249, %f673;
336
+ sub.f32 %f266, %f250, %f674;
337
+ sub.f32 %f267, %f251, %f675;
338
+ sub.f32 %f268, %f252, %f676;
339
+ sub.f32 %f269, %f253, %f677;
340
+ .loc 2 97 26
341
+ add.f32 %f614, %f614, 0f3F800000;
342
+ add.f32 %f615, %f615, 0f3F800000;
343
+ add.f32 %f616, %f616, 0f3F800000;
344
+ add.f32 %f617, %f617, 0f3F800000;
345
+ add.f32 %f618, %f618, 0f3F800000;
346
+ add.f32 %f619, %f619, 0f3F800000;
347
+ add.f32 %f620, %f620, 0f3F800000;
348
+ add.f32 %f621, %f621, 0f3F800000;
349
+ add.f32 %f622, %f622, 0f3F800000;
350
+ add.f32 %f623, %f623, 0f3F800000;
351
+ add.f32 %f624, %f624, 0f3F800000;
352
+ add.f32 %f625, %f625, 0f3F800000;
353
+ add.f32 %f626, %f626, 0f3F800000;
354
+ add.f32 %f627, %f627, 0f3F800000;
355
+ add.f32 %f628, %f628, 0f3F800000;
356
+ add.f32 %f629, %f629, 0f3F800000;
357
+ add.f32 %f630, %f630, 0f3F800000;
358
+ add.f32 %f631, %f631, 0f3F800000;
359
+ add.f32 %f632, %f632, 0f3F800000;
360
+ add.f32 %f633, %f633, 0f3F800000;
361
+ add.f32 %f634, %f634, 0f3F800000;
362
+ add.f32 %f635, %f635, 0f3F800000;
363
+ add.f32 %f636, %f636, 0f3F800000;
364
+ add.f32 %f637, %f637, 0f3F800000;
365
+ add.f32 %f638, %f638, 0f3F800000;
366
+ add.f32 %f639, %f639, 0f3F800000;
367
+ add.f32 %f640, %f640, 0f3F800000;
368
+ add.f32 %f641, %f641, 0f3F800000;
369
+ add.f32 %f642, %f642, 0f3F800000;
370
+ add.f32 %f643, %f643, 0f3F800000;
371
+ add.f32 %f644, %f644, 0f3F800000;
372
+ add.f32 %f645, %f645, 0f3F800000;
373
+ .loc 2 98 30
374
+ mov.b32 %r135, %f254;
375
+ mov.b32 %r136, %f614;
376
+ div.full.f32 %r134, %r135, %r136;
377
+ mov.b32 %f270, %r134;
378
+ mov.b32 %r138, %f255;
379
+ mov.b32 %r139, %f615;
380
+ div.full.f32 %r137, %r138, %r139;
381
+ mov.b32 %f271, %r137;
382
+ mov.b32 %r141, %f256;
383
+ mov.b32 %r142, %f616;
384
+ div.full.f32 %r140, %r141, %r142;
385
+ mov.b32 %f272, %r140;
386
+ mov.b32 %r144, %f257;
387
+ mov.b32 %r145, %f617;
388
+ div.full.f32 %r143, %r144, %r145;
389
+ mov.b32 %f273, %r143;
390
+ mov.b32 %r147, %f258;
391
+ mov.b32 %r148, %f618;
392
+ div.full.f32 %r146, %r147, %r148;
393
+ mov.b32 %f274, %r146;
394
+ mov.b32 %r150, %f259;
395
+ mov.b32 %r151, %f619;
396
+ div.full.f32 %r149, %r150, %r151;
397
+ mov.b32 %f275, %r149;
398
+ mov.b32 %r153, %f260;
399
+ mov.b32 %r154, %f620;
400
+ div.full.f32 %r152, %r153, %r154;
401
+ mov.b32 %f276, %r152;
402
+ mov.b32 %r156, %f261;
403
+ mov.b32 %r157, %f621;
404
+ div.full.f32 %r155, %r156, %r157;
405
+ mov.b32 %f277, %r155;
406
+ mov.b32 %r159, %f262;
407
+ mov.b32 %r160, %f622;
408
+ div.full.f32 %r158, %r159, %r160;
409
+ mov.b32 %f278, %r158;
410
+ mov.b32 %r162, %f263;
411
+ mov.b32 %r163, %f623;
412
+ div.full.f32 %r161, %r162, %r163;
413
+ mov.b32 %f279, %r161;
414
+ mov.b32 %r165, %f264;
415
+ mov.b32 %r166, %f624;
416
+ div.full.f32 %r164, %r165, %r166;
417
+ mov.b32 %f280, %r164;
418
+ mov.b32 %r168, %f265;
419
+ mov.b32 %r169, %f625;
420
+ div.full.f32 %r167, %r168, %r169;
421
+ mov.b32 %f281, %r167;
422
+ mov.b32 %r171, %f266;
423
+ mov.b32 %r172, %f626;
424
+ div.full.f32 %r170, %r171, %r172;
425
+ mov.b32 %f282, %r170;
426
+ mov.b32 %r174, %f267;
427
+ mov.b32 %r175, %f627;
428
+ div.full.f32 %r173, %r174, %r175;
429
+ mov.b32 %f283, %r173;
430
+ mov.b32 %r177, %f268;
431
+ mov.b32 %r178, %f628;
432
+ div.full.f32 %r176, %r177, %r178;
433
+ mov.b32 %f284, %r176;
434
+ mov.b32 %r180, %f269;
435
+ mov.b32 %r181, %f629;
436
+ div.full.f32 %r179, %r180, %r181;
437
+ mov.b32 %f285, %r179;
438
+ .loc 2 98 22
439
+ add.f32 %f662, %f662, %f270;
440
+ add.f32 %f663, %f663, %f271;
441
+ add.f32 %f664, %f664, %f272;
442
+ add.f32 %f665, %f665, %f273;
443
+ add.f32 %f666, %f666, %f274;
444
+ add.f32 %f667, %f667, %f275;
445
+ add.f32 %f668, %f668, %f276;
446
+ add.f32 %f669, %f669, %f277;
447
+ add.f32 %f670, %f670, %f278;
448
+ add.f32 %f671, %f671, %f279;
449
+ add.f32 %f672, %f672, %f280;
450
+ add.f32 %f673, %f673, %f281;
451
+ add.f32 %f674, %f674, %f282;
452
+ add.f32 %f675, %f675, %f283;
453
+ add.f32 %f676, %f676, %f284;
454
+ add.f32 %f677, %f677, %f285;
455
+ .loc 2 101 30
456
+ sub.f32 %f286, %f238, %f662;
457
+ sub.f32 %f287, %f239, %f663;
458
+ sub.f32 %f288, %f240, %f664;
459
+ sub.f32 %f289, %f241, %f665;
460
+ sub.f32 %f290, %f242, %f666;
461
+ sub.f32 %f291, %f243, %f667;
462
+ sub.f32 %f292, %f244, %f668;
463
+ sub.f32 %f293, %f245, %f669;
464
+ sub.f32 %f294, %f246, %f670;
465
+ sub.f32 %f295, %f247, %f671;
466
+ sub.f32 %f296, %f248, %f672;
467
+ sub.f32 %f297, %f249, %f673;
468
+ sub.f32 %f298, %f250, %f674;
469
+ sub.f32 %f299, %f251, %f675;
470
+ sub.f32 %f300, %f252, %f676;
471
+ sub.f32 %f301, %f253, %f677;
472
+ $L__tmp2:
473
+ .loc 1 50 50
474
+ fma.rn.f32 %f646, %f254, %f286, %f646;
475
+ fma.rn.f32 %f647, %f255, %f287, %f647;
476
+ fma.rn.f32 %f648, %f256, %f288, %f648;
477
+ fma.rn.f32 %f649, %f257, %f289, %f649;
478
+ fma.rn.f32 %f650, %f258, %f290, %f650;
479
+ fma.rn.f32 %f651, %f259, %f291, %f651;
480
+ fma.rn.f32 %f652, %f260, %f292, %f652;
481
+ fma.rn.f32 %f653, %f261, %f293, %f653;
482
+ fma.rn.f32 %f654, %f262, %f294, %f654;
483
+ fma.rn.f32 %f655, %f263, %f295, %f655;
484
+ fma.rn.f32 %f656, %f264, %f296, %f656;
485
+ fma.rn.f32 %f657, %f265, %f297, %f657;
486
+ fma.rn.f32 %f658, %f266, %f298, %f658;
487
+ fma.rn.f32 %f659, %f267, %f299, %f659;
488
+ fma.rn.f32 %f660, %f268, %f300, %f660;
489
+ fma.rn.f32 %f661, %f269, %f301, %f661;
490
+ .loc 1 31 36
491
+ add.s64 %rd116, %rd116, 256;
492
+ add.s32 %r472, %r472, 64;
493
+ setp.lt.u32 %p72, %r472, 192;
494
+ @%p72 bra $L__BB0_1;
495
+ bra.uni $L__BB0_4;
496
+ $L__BB0_1:
497
+ .loc 1 40 40
498
+ setp.lt.u64 %p51, %rd1, 50257;
499
+ .loc 1 35 34
500
+ add.s64 %rd70, %rd6, %rd116;
501
+ add.s64 %rd71, %rd70, 16;
502
+ add.s64 %rd72, %rd5, %rd116;
503
+ .loc 1 35 50
504
+ add.s64 %rd73, %rd72, 16;
505
+ mov.b32 %r411, 0;
506
+ mov.u32 %r34, 0x0;
507
+ mov.u32 %r35, 0x0;
508
+ mov.u32 %r36, 0x0;
509
+ mov.u32 %r37, 0x0;
510
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd70 + 0 ];
511
+ @!%p1 mov.u32 %r34, %r411;
512
+ @!%p1 mov.u32 %r35, %r411;
513
+ @!%p1 mov.u32 %r36, %r411;
514
+ @!%p1 mov.u32 %r37, %r411;
515
+ mov.b32 %f65, %r34;
516
+ mov.b32 %f66, %r35;
517
+ mov.b32 %f67, %r36;
518
+ mov.b32 %f68, %r37;
519
+ mov.u32 %r42, 0x0;
520
+ mov.u32 %r43, 0x0;
521
+ mov.u32 %r44, 0x0;
522
+ mov.u32 %r45, 0x0;
523
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd71 + 0 ];
524
+ @!%p1 mov.u32 %r42, %r411;
525
+ @!%p1 mov.u32 %r43, %r411;
526
+ @!%p1 mov.u32 %r44, %r411;
527
+ @!%p1 mov.u32 %r45, %r411;
528
+ mov.b32 %f69, %r42;
529
+ mov.b32 %f70, %r43;
530
+ mov.b32 %f71, %r44;
531
+ mov.b32 %f72, %r45;
532
+ mov.u32 %r50, 0x0;
533
+ mov.u32 %r51, 0x0;
534
+ mov.u32 %r52, 0x0;
535
+ mov.u32 %r53, 0x0;
536
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd72 + 0 ];
537
+ @!%p1 mov.u32 %r50, %r411;
538
+ @!%p1 mov.u32 %r51, %r411;
539
+ @!%p1 mov.u32 %r52, %r411;
540
+ @!%p1 mov.u32 %r53, %r411;
541
+ mov.b32 %f73, %r50;
542
+ mov.b32 %f74, %r51;
543
+ mov.b32 %f75, %r52;
544
+ mov.b32 %f76, %r53;
545
+ mov.u32 %r58, 0x0;
546
+ mov.u32 %r59, 0x0;
547
+ mov.u32 %r60, 0x0;
548
+ mov.u32 %r61, 0x0;
549
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd73 + 0 ];
550
+ @!%p1 mov.u32 %r58, %r411;
551
+ @!%p1 mov.u32 %r59, %r411;
552
+ @!%p1 mov.u32 %r60, %r411;
553
+ @!%p1 mov.u32 %r61, %r411;
554
+ mov.b32 %f77, %r58;
555
+ mov.b32 %f78, %r59;
556
+ mov.b32 %f79, %r60;
557
+ mov.b32 %f80, %r61;
558
+ .loc 1 36 40
559
+ add.s32 %r98, %r5, %r472;
560
+ add.s32 %r99, %r98, 64;
561
+ .loc 1 36 34
562
+ add.s32 %r100, %r98, 8256;
563
+ mul.wide.s32 %rd76, %r99, 2;
564
+ add.s64 %rd74, %rd14, %rd76;
565
+ mul.wide.s32 %rd77, %r100, 2;
566
+ add.s64 %rd75, %rd14, %rd77;
567
+ .loc 1 36 50
568
+ mov.u32 %r66, 0x0;
569
+ mov.u32 %r67, 0x0;
570
+ mov.u32 %r68, 0x0;
571
+ mov.u32 %r69, 0x0;
572
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd74 + 0 ];
573
+ @!%p1 mov.u32 %r66, %r411;
574
+ @!%p1 mov.u32 %r67, %r411;
575
+ @!%p1 mov.u32 %r68, %r411;
576
+ @!%p1 mov.u32 %r69, %r411;
577
+ cvt.u16.u32 %rs1, %r66;
578
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r66; }
579
+ cvt.u16.u32 %rs3, %r67;
580
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r67; }
581
+ cvt.u16.u32 %rs5, %r68;
582
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r68; }
583
+ cvt.u16.u32 %rs7, %r69;
584
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r69; }
585
+ mov.u32 %r74, 0x0;
586
+ mov.u32 %r75, 0x0;
587
+ mov.u32 %r76, 0x0;
588
+ mov.u32 %r77, 0x0;
589
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd75 + 0 ];
590
+ @!%p1 mov.u32 %r74, %r411;
591
+ @!%p1 mov.u32 %r75, %r411;
592
+ @!%p1 mov.u32 %r76, %r411;
593
+ @!%p1 mov.u32 %r77, %r411;
594
+ cvt.u16.u32 %rs9, %r74;
595
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r74; }
596
+ cvt.u16.u32 %rs11, %r75;
597
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r75; }
598
+ cvt.u16.u32 %rs13, %r76;
599
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r76; }
600
+ cvt.u16.u32 %rs15, %r77;
601
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r77; }
602
+ .loc 1 36 101
603
+ cvt.f32.bf16 %r82, %rs1;
604
+ mov.b32 %f81, %r82;
605
+ cvt.f32.bf16 %r83, %rs2;
606
+ mov.b32 %f82, %r83;
607
+ cvt.f32.bf16 %r84, %rs3;
608
+ mov.b32 %f83, %r84;
609
+ cvt.f32.bf16 %r85, %rs4;
610
+ mov.b32 %f84, %r85;
611
+ cvt.f32.bf16 %r86, %rs5;
612
+ mov.b32 %f85, %r86;
613
+ cvt.f32.bf16 %r87, %rs6;
614
+ mov.b32 %f86, %r87;
615
+ cvt.f32.bf16 %r88, %rs7;
616
+ mov.b32 %f87, %r88;
617
+ cvt.f32.bf16 %r89, %rs8;
618
+ mov.b32 %f88, %r89;
619
+ cvt.f32.bf16 %r90, %rs9;
620
+ mov.b32 %f89, %r90;
621
+ cvt.f32.bf16 %r91, %rs10;
622
+ mov.b32 %f90, %r91;
623
+ cvt.f32.bf16 %r92, %rs11;
624
+ mov.b32 %f91, %r92;
625
+ cvt.f32.bf16 %r93, %rs12;
626
+ mov.b32 %f92, %r93;
627
+ cvt.f32.bf16 %r94, %rs13;
628
+ mov.b32 %f93, %r94;
629
+ cvt.f32.bf16 %r95, %rs14;
630
+ mov.b32 %f94, %r95;
631
+ cvt.f32.bf16 %r96, %rs15;
632
+ mov.b32 %f95, %r96;
633
+ cvt.f32.bf16 %r97, %rs16;
634
+ mov.b32 %f96, %r97;
635
+ mov.b32 %r471, 883;
636
+ mov.u64 %rd115, 1;
637
+ .loc 1 40 55
638
+ @%p51 bra $L__BB0_3;
639
+ mov.u64 %rd78, assertMessage_0;
640
+ cvta.global.u64 %rd79, %rd78;
641
+ mov.u64 %rd80, assertFile_0;
642
+ cvta.global.u64 %rd81, %rd80;
643
+ mov.u64 %rd82, assertFunc_0;
644
+ cvta.global.u64 %rd83, %rd82;
645
+ { // callseq 6, 0
646
+ .reg .b32 temp_param_reg;
647
+ .param .b64 param0;
648
+ st.param.b64 [param0+0], %rd79;
649
+ .param .b64 param1;
650
+ st.param.b64 [param1+0], %rd81;
651
+ .param .b32 param2;
652
+ st.param.b32 [param2+0], %r471;
653
+ .param .b64 param3;
654
+ st.param.b64 [param3+0], %rd83;
655
+ .param .b64 param4;
656
+ st.param.b64 [param4+0], %rd115;
657
+ call.uni
658
+ __assertfail,
659
+ (
660
+ param0,
661
+ param1,
662
+ param2,
663
+ param3,
664
+ param4
665
+ );
666
+ } // callseq 6
667
+ bra.uni $L__BB0_3;
668
+ $L__BB0_4:
669
+ .loc 1 31 36
670
+ and.b32 %r291, %r4, 3;
671
+ mad.lo.s32 %r292, %r291, 72, %r2;
672
+ shl.b32 %r293, %r292, 2;
673
+ mov.u32 %r294, global_smem;
674
+ add.s32 %r295, %r294, %r293;
675
+ st.shared.f32 [%r295], %f630;
676
+ st.shared.f32 [%r295+1152], %f631;
677
+ st.shared.f32 [%r295+2304], %f632;
678
+ st.shared.f32 [%r295+3456], %f633;
679
+ st.shared.f32 [%r295+4608], %f634;
680
+ st.shared.f32 [%r295+5760], %f635;
681
+ st.shared.f32 [%r295+6912], %f636;
682
+ st.shared.f32 [%r295+8064], %f637;
683
+ bar.sync 0;
684
+ mad.lo.s32 %r296, %r1, 72, %r3;
685
+ shl.b32 %r297, %r296, 2;
686
+ add.s32 %r298, %r294, %r297;
687
+ ld.shared.v4.f32 {%f302, %f303, %f304, %f305}, [%r298];
688
+ ld.shared.v4.f32 {%f306, %f307, %f308, %f309}, [%r298+16];
689
+ bar.sync 0;
690
+ st.shared.f32 [%r295], %f638;
691
+ st.shared.f32 [%r295+1152], %f639;
692
+ st.shared.f32 [%r295+2304], %f640;
693
+ st.shared.f32 [%r295+3456], %f641;
694
+ st.shared.f32 [%r295+4608], %f642;
695
+ st.shared.f32 [%r295+5760], %f643;
696
+ st.shared.f32 [%r295+6912], %f644;
697
+ st.shared.f32 [%r295+8064], %f645;
698
+ bar.sync 0;
699
+ ld.shared.v4.f32 {%f310, %f311, %f312, %f313}, [%r298];
700
+ ld.shared.v4.f32 {%f314, %f315, %f316, %f317}, [%r298+16];
701
+ $L__tmp3:
702
+ .loc 2 108 21
703
+ sub.f32 %f318, %f663, %f662;
704
+ .loc 2 109 28
705
+ add.f32 %f319, %f302, %f303;
706
+ .loc 2 110 39
707
+ setp.eq.f32 %p73, %f319, 0f00000000;
708
+ .loc 2 110 60
709
+ mov.b32 %r183, %f303;
710
+ mov.b32 %r184, %f319;
711
+ div.full.f32 %r182, %r183, %r184;
712
+ mov.b32 %f320, %r182;
713
+ .loc 2 110 49
714
+ selp.f32 %f321, 0f00000000, %f320, %p73;
715
+ .loc 2 112 17
716
+ fma.rn.f32 %f322, %f318, %f321, %f662;
717
+ .loc 2 113 15
718
+ add.f32 %f323, %f646, %f647;
719
+ .loc 2 113 30
720
+ mul.f32 %f324, %f318, %f318;
721
+ .loc 2 113 38
722
+ mul.f32 %f325, %f324, %f302;
723
+ .loc 2 113 22
724
+ fma.rn.f32 %f326, %f325, %f321, %f323;
725
+ .loc 2 108 21
726
+ sub.f32 %f327, %f664, %f322;
727
+ .loc 2 109 28
728
+ add.f32 %f328, %f304, %f319;
729
+ .loc 2 110 39
730
+ setp.eq.f32 %p74, %f328, 0f00000000;
731
+ .loc 2 110 60
732
+ mov.b32 %r187, %f328;
733
+ mov.b32 %r186, %f304;
734
+ div.full.f32 %r185, %r186, %r187;
735
+ mov.b32 %f329, %r185;
736
+ .loc 2 110 49
737
+ selp.f32 %f330, 0f00000000, %f329, %p74;
738
+ .loc 2 112 17
739
+ fma.rn.f32 %f331, %f330, %f327, %f322;
740
+ .loc 2 113 15
741
+ add.f32 %f332, %f648, %f326;
742
+ .loc 2 113 30
743
+ mul.f32 %f333, %f327, %f327;
744
+ .loc 2 113 38
745
+ mul.f32 %f334, %f319, %f333;
746
+ .loc 2 113 22
747
+ fma.rn.f32 %f335, %f330, %f334, %f332;
748
+ .loc 2 108 21
749
+ sub.f32 %f336, %f665, %f331;
750
+ .loc 2 109 28
751
+ add.f32 %f337, %f305, %f328;
752
+ .loc 2 110 39
753
+ setp.eq.f32 %p75, %f337, 0f00000000;
754
+ .loc 2 110 60
755
+ mov.b32 %r190, %f337;
756
+ mov.b32 %r189, %f305;
757
+ div.full.f32 %r188, %r189, %r190;
758
+ mov.b32 %f338, %r188;
759
+ .loc 2 110 49
760
+ selp.f32 %f339, 0f00000000, %f338, %p75;
761
+ .loc 2 112 17
762
+ fma.rn.f32 %f340, %f339, %f336, %f331;
763
+ .loc 2 113 15
764
+ add.f32 %f341, %f649, %f335;
765
+ .loc 2 113 30
766
+ mul.f32 %f342, %f336, %f336;
767
+ .loc 2 113 38
768
+ mul.f32 %f343, %f328, %f342;
769
+ .loc 2 113 22
770
+ fma.rn.f32 %f344, %f339, %f343, %f341;
771
+ .loc 2 108 21
772
+ sub.f32 %f345, %f666, %f340;
773
+ .loc 2 109 28
774
+ add.f32 %f346, %f306, %f337;
775
+ .loc 2 110 39
776
+ setp.eq.f32 %p76, %f346, 0f00000000;
777
+ .loc 2 110 60
778
+ mov.b32 %r193, %f346;
779
+ mov.b32 %r192, %f306;
780
+ div.full.f32 %r191, %r192, %r193;
781
+ mov.b32 %f347, %r191;
782
+ .loc 2 110 49
783
+ selp.f32 %f348, 0f00000000, %f347, %p76;
784
+ .loc 2 112 17
785
+ fma.rn.f32 %f349, %f348, %f345, %f340;
786
+ .loc 2 113 15
787
+ add.f32 %f350, %f650, %f344;
788
+ .loc 2 113 30
789
+ mul.f32 %f351, %f345, %f345;
790
+ .loc 2 113 38
791
+ mul.f32 %f352, %f337, %f351;
792
+ .loc 2 113 22
793
+ fma.rn.f32 %f353, %f348, %f352, %f350;
794
+ .loc 2 108 21
795
+ sub.f32 %f354, %f667, %f349;
796
+ .loc 2 109 28
797
+ add.f32 %f355, %f307, %f346;
798
+ .loc 2 110 39
799
+ setp.eq.f32 %p77, %f355, 0f00000000;
800
+ .loc 2 110 60
801
+ mov.b32 %r196, %f355;
802
+ mov.b32 %r195, %f307;
803
+ div.full.f32 %r194, %r195, %r196;
804
+ mov.b32 %f356, %r194;
805
+ .loc 2 110 49
806
+ selp.f32 %f357, 0f00000000, %f356, %p77;
807
+ .loc 2 112 17
808
+ fma.rn.f32 %f358, %f357, %f354, %f349;
809
+ .loc 2 113 15
810
+ add.f32 %f359, %f651, %f353;
811
+ .loc 2 113 30
812
+ mul.f32 %f360, %f354, %f354;
813
+ .loc 2 113 38
814
+ mul.f32 %f361, %f346, %f360;
815
+ .loc 2 113 22
816
+ fma.rn.f32 %f362, %f357, %f361, %f359;
817
+ .loc 2 108 21
818
+ sub.f32 %f363, %f668, %f358;
819
+ .loc 2 109 28
820
+ add.f32 %f364, %f308, %f355;
821
+ .loc 2 110 39
822
+ setp.eq.f32 %p78, %f364, 0f00000000;
823
+ .loc 2 110 60
824
+ mov.b32 %r199, %f364;
825
+ mov.b32 %r198, %f308;
826
+ div.full.f32 %r197, %r198, %r199;
827
+ mov.b32 %f365, %r197;
828
+ .loc 2 110 49
829
+ selp.f32 %f366, 0f00000000, %f365, %p78;
830
+ .loc 2 112 17
831
+ fma.rn.f32 %f367, %f366, %f363, %f358;
832
+ .loc 2 113 15
833
+ add.f32 %f368, %f652, %f362;
834
+ .loc 2 113 30
835
+ mul.f32 %f369, %f363, %f363;
836
+ .loc 2 113 38
837
+ mul.f32 %f370, %f355, %f369;
838
+ .loc 2 113 22
839
+ fma.rn.f32 %f371, %f366, %f370, %f368;
840
+ .loc 2 108 21
841
+ sub.f32 %f372, %f669, %f367;
842
+ .loc 2 109 28
843
+ add.f32 %f373, %f309, %f364;
844
+ .loc 2 110 39
845
+ setp.eq.f32 %p79, %f373, 0f00000000;
846
+ .loc 2 110 60
847
+ mov.b32 %r202, %f373;
848
+ mov.b32 %r201, %f309;
849
+ div.full.f32 %r200, %r201, %r202;
850
+ mov.b32 %f374, %r200;
851
+ .loc 2 110 49
852
+ selp.f32 %f375, 0f00000000, %f374, %p79;
853
+ .loc 2 112 17
854
+ fma.rn.f32 %f376, %f375, %f372, %f367;
855
+ .loc 2 113 15
856
+ add.f32 %f377, %f653, %f371;
857
+ .loc 2 113 30
858
+ mul.f32 %f378, %f372, %f372;
859
+ .loc 2 113 38
860
+ mul.f32 %f379, %f364, %f378;
861
+ .loc 2 113 22
862
+ fma.rn.f32 %f380, %f375, %f379, %f377;
863
+ .loc 2 108 21
864
+ sub.f32 %f381, %f671, %f670;
865
+ .loc 2 109 28
866
+ add.f32 %f382, %f310, %f311;
867
+ .loc 2 110 39
868
+ setp.eq.f32 %p80, %f382, 0f00000000;
869
+ .loc 2 110 60
870
+ mov.b32 %r204, %f311;
871
+ mov.b32 %r205, %f382;
872
+ div.full.f32 %r203, %r204, %r205;
873
+ mov.b32 %f383, %r203;
874
+ .loc 2 110 49
875
+ selp.f32 %f384, 0f00000000, %f383, %p80;
876
+ .loc 2 112 17
877
+ fma.rn.f32 %f385, %f381, %f384, %f670;
878
+ .loc 2 113 15
879
+ add.f32 %f386, %f654, %f655;
880
+ .loc 2 113 30
881
+ mul.f32 %f387, %f381, %f381;
882
+ .loc 2 113 38
883
+ mul.f32 %f388, %f387, %f310;
884
+ .loc 2 113 22
885
+ fma.rn.f32 %f389, %f388, %f384, %f386;
886
+ .loc 2 108 21
887
+ sub.f32 %f390, %f672, %f385;
888
+ .loc 2 109 28
889
+ add.f32 %f391, %f312, %f382;
890
+ .loc 2 110 39
891
+ setp.eq.f32 %p81, %f391, 0f00000000;
892
+ .loc 2 110 60
893
+ mov.b32 %r208, %f391;
894
+ mov.b32 %r207, %f312;
895
+ div.full.f32 %r206, %r207, %r208;
896
+ mov.b32 %f392, %r206;
897
+ .loc 2 110 49
898
+ selp.f32 %f393, 0f00000000, %f392, %p81;
899
+ .loc 2 112 17
900
+ fma.rn.f32 %f394, %f393, %f390, %f385;
901
+ .loc 2 113 15
902
+ add.f32 %f395, %f656, %f389;
903
+ .loc 2 113 30
904
+ mul.f32 %f396, %f390, %f390;
905
+ .loc 2 113 38
906
+ mul.f32 %f397, %f382, %f396;
907
+ .loc 2 113 22
908
+ fma.rn.f32 %f398, %f393, %f397, %f395;
909
+ .loc 2 108 21
910
+ sub.f32 %f399, %f673, %f394;
911
+ .loc 2 109 28
912
+ add.f32 %f400, %f313, %f391;
913
+ .loc 2 110 39
914
+ setp.eq.f32 %p82, %f400, 0f00000000;
915
+ .loc 2 110 60
916
+ mov.b32 %r211, %f400;
917
+ mov.b32 %r210, %f313;
918
+ div.full.f32 %r209, %r210, %r211;
919
+ mov.b32 %f401, %r209;
920
+ .loc 2 110 49
921
+ selp.f32 %f402, 0f00000000, %f401, %p82;
922
+ .loc 2 112 17
923
+ fma.rn.f32 %f403, %f402, %f399, %f394;
924
+ .loc 2 113 15
925
+ add.f32 %f404, %f657, %f398;
926
+ .loc 2 113 30
927
+ mul.f32 %f405, %f399, %f399;
928
+ .loc 2 113 38
929
+ mul.f32 %f406, %f391, %f405;
930
+ .loc 2 113 22
931
+ fma.rn.f32 %f407, %f402, %f406, %f404;
932
+ .loc 2 108 21
933
+ sub.f32 %f408, %f674, %f403;
934
+ .loc 2 109 28
935
+ add.f32 %f409, %f314, %f400;
936
+ .loc 2 110 39
937
+ setp.eq.f32 %p83, %f409, 0f00000000;
938
+ .loc 2 110 60
939
+ mov.b32 %r214, %f409;
940
+ mov.b32 %r213, %f314;
941
+ div.full.f32 %r212, %r213, %r214;
942
+ mov.b32 %f410, %r212;
943
+ .loc 2 110 49
944
+ selp.f32 %f411, 0f00000000, %f410, %p83;
945
+ .loc 2 112 17
946
+ fma.rn.f32 %f412, %f411, %f408, %f403;
947
+ .loc 2 113 15
948
+ add.f32 %f413, %f658, %f407;
949
+ .loc 2 113 30
950
+ mul.f32 %f414, %f408, %f408;
951
+ .loc 2 113 38
952
+ mul.f32 %f415, %f400, %f414;
953
+ .loc 2 113 22
954
+ fma.rn.f32 %f416, %f411, %f415, %f413;
955
+ .loc 2 108 21
956
+ sub.f32 %f417, %f675, %f412;
957
+ .loc 2 109 28
958
+ add.f32 %f418, %f315, %f409;
959
+ .loc 2 110 39
960
+ setp.eq.f32 %p84, %f418, 0f00000000;
961
+ .loc 2 110 60
962
+ mov.b32 %r217, %f418;
963
+ mov.b32 %r216, %f315;
964
+ div.full.f32 %r215, %r216, %r217;
965
+ mov.b32 %f419, %r215;
966
+ .loc 2 110 49
967
+ selp.f32 %f420, 0f00000000, %f419, %p84;
968
+ .loc 2 112 17
969
+ fma.rn.f32 %f421, %f420, %f417, %f412;
970
+ .loc 2 113 15
971
+ add.f32 %f422, %f659, %f416;
972
+ .loc 2 113 30
973
+ mul.f32 %f423, %f417, %f417;
974
+ .loc 2 113 38
975
+ mul.f32 %f424, %f409, %f423;
976
+ .loc 2 113 22
977
+ fma.rn.f32 %f425, %f420, %f424, %f422;
978
+ .loc 2 108 21
979
+ sub.f32 %f426, %f676, %f421;
980
+ .loc 2 109 28
981
+ add.f32 %f427, %f316, %f418;
982
+ .loc 2 110 39
983
+ setp.eq.f32 %p85, %f427, 0f00000000;
984
+ .loc 2 110 60
985
+ mov.b32 %r220, %f427;
986
+ mov.b32 %r219, %f316;
987
+ div.full.f32 %r218, %r219, %r220;
988
+ mov.b32 %f428, %r218;
989
+ .loc 2 110 49
990
+ selp.f32 %f429, 0f00000000, %f428, %p85;
991
+ .loc 2 112 17
992
+ fma.rn.f32 %f430, %f429, %f426, %f421;
993
+ .loc 2 113 15
994
+ add.f32 %f431, %f660, %f425;
995
+ .loc 2 113 30
996
+ mul.f32 %f432, %f426, %f426;
997
+ .loc 2 113 38
998
+ mul.f32 %f433, %f418, %f432;
999
+ .loc 2 113 22
1000
+ fma.rn.f32 %f434, %f429, %f433, %f431;
1001
+ .loc 2 108 21
1002
+ sub.f32 %f435, %f677, %f430;
1003
+ .loc 2 109 28
1004
+ add.f32 %f436, %f317, %f427;
1005
+ .loc 2 110 39
1006
+ setp.eq.f32 %p86, %f436, 0f00000000;
1007
+ .loc 2 110 60
1008
+ mov.b32 %r223, %f436;
1009
+ mov.b32 %r222, %f317;
1010
+ div.full.f32 %r221, %r222, %r223;
1011
+ mov.b32 %f437, %r221;
1012
+ .loc 2 110 49
1013
+ selp.f32 %f438, 0f00000000, %f437, %p86;
1014
+ .loc 2 112 17
1015
+ fma.rn.f32 %f439, %f438, %f435, %f430;
1016
+ .loc 2 113 15
1017
+ add.f32 %f440, %f661, %f434;
1018
+ .loc 2 113 30
1019
+ mul.f32 %f441, %f435, %f435;
1020
+ .loc 2 113 38
1021
+ mul.f32 %f442, %f427, %f441;
1022
+ .loc 2 113 22
1023
+ fma.rn.f32 %f443, %f438, %f442, %f440;
1024
+ $L__tmp4:
1025
+ .loc 2 120 46
1026
+ mov.b32 %r299, %f376;
1027
+ shfl.sync.bfly.b32 %r300, %r299, 4, 31, -1;
1028
+ mov.b32 %f444, %r300;
1029
+ mov.b32 %r301, %f380;
1030
+ shfl.sync.bfly.b32 %r302, %r301, 4, 31, -1;
1031
+ mov.b32 %f445, %r302;
1032
+ shfl.sync.bfly.b32 %r225, %r202, 4, 31, -1;
1033
+ mov.b32 %f446, %r225;
1034
+ $L__tmp5:
1035
+ .loc 2 108 21
1036
+ sub.f32 %f447, %f444, %f376;
1037
+ .loc 2 109 28
1038
+ add.f32 %f448, %f373, %f446;
1039
+ .loc 2 110 39
1040
+ setp.eq.f32 %p87, %f448, 0f00000000;
1041
+ .loc 2 110 60
1042
+ mov.b32 %r226, %f448;
1043
+ div.full.f32 %r224, %r225, %r226;
1044
+ mov.b32 %f449, %r224;
1045
+ .loc 2 110 49
1046
+ selp.f32 %f450, 0f00000000, %f449, %p87;
1047
+ .loc 2 112 17
1048
+ fma.rn.f32 %f451, %f450, %f447, %f376;
1049
+ .loc 2 113 15
1050
+ add.f32 %f452, %f380, %f445;
1051
+ .loc 2 113 30
1052
+ mul.f32 %f453, %f447, %f447;
1053
+ .loc 2 113 38
1054
+ mul.f32 %f454, %f373, %f453;
1055
+ .loc 2 113 22
1056
+ fma.rn.f32 %f455, %f450, %f454, %f452;
1057
+ $L__tmp6:
1058
+ .loc 2 120 46
1059
+ mov.b32 %r303, %f451;
1060
+ shfl.sync.bfly.b32 %r304, %r303, 2, 31, -1;
1061
+ mov.b32 %f456, %r304;
1062
+ mov.b32 %r305, %f455;
1063
+ shfl.sync.bfly.b32 %r306, %r305, 2, 31, -1;
1064
+ mov.b32 %f457, %r306;
1065
+ shfl.sync.bfly.b32 %r228, %r226, 2, 31, -1;
1066
+ mov.b32 %f458, %r228;
1067
+ $L__tmp7:
1068
+ .loc 2 108 21
1069
+ sub.f32 %f459, %f456, %f451;
1070
+ .loc 2 109 28
1071
+ add.f32 %f460, %f448, %f458;
1072
+ .loc 2 110 39
1073
+ setp.eq.f32 %p88, %f460, 0f00000000;
1074
+ .loc 2 110 60
1075
+ mov.b32 %r229, %f460;
1076
+ div.full.f32 %r227, %r228, %r229;
1077
+ mov.b32 %f461, %r227;
1078
+ .loc 2 110 49
1079
+ selp.f32 %f462, 0f00000000, %f461, %p88;
1080
+ .loc 2 112 17
1081
+ fma.rn.f32 %f463, %f462, %f459, %f451;
1082
+ .loc 2 113 15
1083
+ add.f32 %f464, %f455, %f457;
1084
+ .loc 2 113 30
1085
+ mul.f32 %f465, %f459, %f459;
1086
+ .loc 2 113 38
1087
+ mul.f32 %f466, %f448, %f465;
1088
+ .loc 2 113 22
1089
+ fma.rn.f32 %f467, %f462, %f466, %f464;
1090
+ $L__tmp8:
1091
+ .loc 2 120 46
1092
+ mov.b32 %r307, %f463;
1093
+ shfl.sync.bfly.b32 %r308, %r307, 1, 31, -1;
1094
+ mov.b32 %f468, %r308;
1095
+ mov.b32 %r309, %f467;
1096
+ shfl.sync.bfly.b32 %r310, %r309, 1, 31, -1;
1097
+ mov.b32 %f469, %r310;
1098
+ shfl.sync.bfly.b32 %r231, %r229, 1, 31, -1;
1099
+ mov.b32 %f470, %r231;
1100
+ $L__tmp9:
1101
+ .loc 2 108 21
1102
+ sub.f32 %f471, %f468, %f463;
1103
+ .loc 2 109 28
1104
+ add.f32 %f472, %f460, %f470;
1105
+ .loc 2 110 39
1106
+ setp.eq.f32 %p89, %f472, 0f00000000;
1107
+ .loc 2 110 60
1108
+ mov.b32 %r232, %f472;
1109
+ div.full.f32 %r230, %r231, %r232;
1110
+ mov.b32 %f473, %r230;
1111
+ .loc 2 110 49
1112
+ selp.f32 %f474, 0f00000000, %f473, %p89;
1113
+ .loc 2 112 17
1114
+ fma.rn.f32 %f161, %f471, %f474, %f463;
1115
+ .loc 2 113 15
1116
+ add.f32 %f475, %f467, %f469;
1117
+ .loc 2 113 30
1118
+ mul.f32 %f476, %f471, %f471;
1119
+ .loc 2 113 38
1120
+ mul.f32 %f477, %f460, %f476;
1121
+ .loc 2 113 22
1122
+ fma.rn.f32 %f478, %f474, %f477, %f475;
1123
+ $L__tmp10:
1124
+ .loc 2 120 46
1125
+ mov.b32 %r311, %f439;
1126
+ shfl.sync.bfly.b32 %r312, %r311, 4, 31, -1;
1127
+ mov.b32 %f479, %r312;
1128
+ mov.b32 %r313, %f443;
1129
+ shfl.sync.bfly.b32 %r314, %r313, 4, 31, -1;
1130
+ mov.b32 %f480, %r314;
1131
+ shfl.sync.bfly.b32 %r234, %r223, 4, 31, -1;
1132
+ mov.b32 %f481, %r234;
1133
+ $L__tmp11:
1134
+ .loc 2 108 21
1135
+ sub.f32 %f482, %f479, %f439;
1136
+ .loc 2 109 28
1137
+ add.f32 %f483, %f436, %f481;
1138
+ .loc 2 110 39
1139
+ setp.eq.f32 %p90, %f483, 0f00000000;
1140
+ .loc 2 110 60
1141
+ mov.b32 %r235, %f483;
1142
+ div.full.f32 %r233, %r234, %r235;
1143
+ mov.b32 %f484, %r233;
1144
+ .loc 2 110 49
1145
+ selp.f32 %f485, 0f00000000, %f484, %p90;
1146
+ .loc 2 112 17
1147
+ fma.rn.f32 %f486, %f482, %f485, %f439;
1148
+ .loc 2 113 15
1149
+ add.f32 %f487, %f443, %f480;
1150
+ .loc 2 113 30
1151
+ mul.f32 %f488, %f482, %f482;
1152
+ .loc 2 113 38
1153
+ mul.f32 %f489, %f436, %f488;
1154
+ .loc 2 113 22
1155
+ fma.rn.f32 %f490, %f489, %f485, %f487;
1156
+ $L__tmp12:
1157
+ .loc 2 120 46
1158
+ mov.b32 %r315, %f486;
1159
+ shfl.sync.bfly.b32 %r316, %r315, 2, 31, -1;
1160
+ mov.b32 %f491, %r316;
1161
+ mov.b32 %r317, %f490;
1162
+ shfl.sync.bfly.b32 %r318, %r317, 2, 31, -1;
1163
+ mov.b32 %f492, %r318;
1164
+ shfl.sync.bfly.b32 %r237, %r235, 2, 31, -1;
1165
+ mov.b32 %f493, %r237;
1166
+ $L__tmp13:
1167
+ .loc 2 108 21
1168
+ sub.f32 %f494, %f491, %f486;
1169
+ .loc 2 109 28
1170
+ add.f32 %f495, %f483, %f493;
1171
+ .loc 2 110 39
1172
+ setp.eq.f32 %p91, %f495, 0f00000000;
1173
+ .loc 2 110 60
1174
+ mov.b32 %r238, %f495;
1175
+ div.full.f32 %r236, %r237, %r238;
1176
+ mov.b32 %f496, %r236;
1177
+ .loc 2 110 49
1178
+ selp.f32 %f497, 0f00000000, %f496, %p91;
1179
+ .loc 2 112 17
1180
+ fma.rn.f32 %f498, %f494, %f497, %f486;
1181
+ .loc 2 113 15
1182
+ add.f32 %f499, %f490, %f492;
1183
+ .loc 2 113 30
1184
+ mul.f32 %f500, %f494, %f494;
1185
+ .loc 2 113 38
1186
+ mul.f32 %f501, %f483, %f500;
1187
+ .loc 2 113 22
1188
+ fma.rn.f32 %f502, %f497, %f501, %f499;
1189
+ $L__tmp14:
1190
+ .loc 2 120 46
1191
+ mov.b32 %r319, %f498;
1192
+ shfl.sync.bfly.b32 %r320, %r319, 1, 31, -1;
1193
+ mov.b32 %f503, %r320;
1194
+ mov.b32 %r321, %f502;
1195
+ shfl.sync.bfly.b32 %r322, %r321, 1, 31, -1;
1196
+ mov.b32 %f504, %r322;
1197
+ shfl.sync.bfly.b32 %r240, %r238, 1, 31, -1;
1198
+ mov.b32 %f505, %r240;
1199
+ $L__tmp15:
1200
+ .loc 2 108 21
1201
+ sub.f32 %f506, %f503, %f498;
1202
+ .loc 2 109 28
1203
+ add.f32 %f507, %f495, %f505;
1204
+ .loc 2 110 39
1205
+ setp.eq.f32 %p92, %f507, 0f00000000;
1206
+ .loc 2 110 60
1207
+ mov.b32 %r241, %f507;
1208
+ div.full.f32 %r239, %r240, %r241;
1209
+ mov.b32 %f508, %r239;
1210
+ .loc 2 110 49
1211
+ selp.f32 %f509, 0f00000000, %f508, %p92;
1212
+ .loc 2 112 17
1213
+ fma.rn.f32 %f162, %f506, %f509, %f498;
1214
+ .loc 2 113 15
1215
+ add.f32 %f510, %f502, %f504;
1216
+ .loc 2 113 30
1217
+ mul.f32 %f511, %f506, %f506;
1218
+ .loc 2 113 38
1219
+ mul.f32 %f512, %f495, %f511;
1220
+ .loc 2 113 22
1221
+ fma.rn.f32 %f513, %f509, %f512, %f510;
1222
+ $L__tmp16:
1223
+ .loc 1 75 24
1224
+ mov.b32 %r243, %f478;
1225
+ mov.b32 %r244, 1132462080;
1226
+ div.full.f32 %r242, %r243, %r244;
1227
+ mov.b32 %f514, %r242;
1228
+ mov.b32 %r267, %f513;
1229
+ div.full.f32 %r266, %r267, %r244;
1230
+ mov.b32 %f515, %r266;
1231
+ .loc 1 77 24
1232
+ add.f32 %f163, %f514, 0f3727C5AC;
1233
+ add.f32 %f164, %f515, 0f3727C5AC;
1234
+ .loc 1 58 36
1235
+ add.s64 %rd9, %rd15, %rd2;
1236
+ mov.u64 %rd117, 0;
1237
+ mov.b32 %r473, -64;
1238
+ rsqrt.approx.ftz.f32 %f580, %f163;
1239
+ rsqrt.approx.ftz.f32 %f581, %f164;
1240
+ bra.uni $L__BB0_5;
1241
+ $L__BB0_7:
1242
+ .loc 1 69 35
1243
+ add.s64 %rd107, %rd4, %rd117;
1244
+ add.s64 %rd108, %rd107, 16;
1245
+ add.s64 %rd109, %rd3, %rd117;
1246
+ .loc 1 69 54
1247
+ add.s64 %rd110, %rd109, 16;
1248
+ mov.u32 %r407, 0x0;
1249
+ mov.u32 %r408, 0x0;
1250
+ mov.u32 %r409, 0x0;
1251
+ mov.u32 %r410, 0x0;
1252
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r407, %r408, %r409, %r410 }, [ %rd107 + 0 ];
1253
+ @!%p1 mov.u32 %r407, %r411;
1254
+ @!%p1 mov.u32 %r408, %r411;
1255
+ @!%p1 mov.u32 %r409, %r411;
1256
+ @!%p1 mov.u32 %r410, %r411;
1257
+ mov.b32 %f516, %r407;
1258
+ mov.b32 %f517, %r408;
1259
+ mov.b32 %f518, %r409;
1260
+ mov.b32 %f519, %r410;
1261
+ mov.u32 %r415, 0x0;
1262
+ mov.u32 %r416, 0x0;
1263
+ mov.u32 %r417, 0x0;
1264
+ mov.u32 %r418, 0x0;
1265
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r415, %r416, %r417, %r418 }, [ %rd108 + 0 ];
1266
+ @!%p1 mov.u32 %r415, %r411;
1267
+ @!%p1 mov.u32 %r416, %r411;
1268
+ @!%p1 mov.u32 %r417, %r411;
1269
+ @!%p1 mov.u32 %r418, %r411;
1270
+ mov.b32 %f520, %r415;
1271
+ mov.b32 %f521, %r416;
1272
+ mov.b32 %f522, %r417;
1273
+ mov.b32 %f523, %r418;
1274
+ mov.u32 %r423, 0x0;
1275
+ mov.u32 %r424, 0x0;
1276
+ mov.u32 %r425, 0x0;
1277
+ mov.u32 %r426, 0x0;
1278
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r423, %r424, %r425, %r426 }, [ %rd109 + 0 ];
1279
+ @!%p1 mov.u32 %r423, %r411;
1280
+ @!%p1 mov.u32 %r424, %r411;
1281
+ @!%p1 mov.u32 %r425, %r411;
1282
+ @!%p1 mov.u32 %r426, %r411;
1283
+ mov.b32 %f524, %r423;
1284
+ mov.b32 %f525, %r424;
1285
+ mov.b32 %f526, %r425;
1286
+ mov.b32 %f527, %r426;
1287
+ mov.u32 %r431, 0x0;
1288
+ mov.u32 %r432, 0x0;
1289
+ mov.u32 %r433, 0x0;
1290
+ mov.u32 %r434, 0x0;
1291
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r431, %r432, %r433, %r434 }, [ %rd110 + 0 ];
1292
+ @!%p1 mov.u32 %r431, %r411;
1293
+ @!%p1 mov.u32 %r432, %r411;
1294
+ @!%p1 mov.u32 %r433, %r411;
1295
+ @!%p1 mov.u32 %r434, %r411;
1296
+ mov.b32 %f528, %r431;
1297
+ mov.b32 %f529, %r432;
1298
+ mov.b32 %f530, %r433;
1299
+ mov.b32 %f531, %r434;
1300
+ .loc 1 70 24
1301
+ add.f32 %f532, %f165, %f516;
1302
+ add.f32 %f533, %f166, %f517;
1303
+ add.f32 %f534, %f167, %f518;
1304
+ add.f32 %f535, %f168, %f519;
1305
+ add.f32 %f536, %f169, %f520;
1306
+ add.f32 %f537, %f170, %f521;
1307
+ add.f32 %f538, %f171, %f522;
1308
+ add.f32 %f539, %f172, %f523;
1309
+ add.f32 %f540, %f173, %f524;
1310
+ add.f32 %f541, %f174, %f525;
1311
+ add.f32 %f542, %f175, %f526;
1312
+ add.f32 %f543, %f176, %f527;
1313
+ add.f32 %f544, %f177, %f528;
1314
+ add.f32 %f545, %f178, %f529;
1315
+ add.f32 %f546, %f179, %f530;
1316
+ add.f32 %f547, %f180, %f531;
1317
+ .loc 1 72 24
1318
+ add.f32 %f548, %f181, %f532;
1319
+ add.f32 %f549, %f182, %f533;
1320
+ add.f32 %f550, %f183, %f534;
1321
+ add.f32 %f551, %f184, %f535;
1322
+ add.f32 %f552, %f185, %f536;
1323
+ add.f32 %f553, %f186, %f537;
1324
+ add.f32 %f554, %f187, %f538;
1325
+ add.f32 %f555, %f188, %f539;
1326
+ add.f32 %f556, %f189, %f540;
1327
+ add.f32 %f557, %f190, %f541;
1328
+ add.f32 %f558, %f191, %f542;
1329
+ add.f32 %f559, %f192, %f543;
1330
+ add.f32 %f560, %f193, %f544;
1331
+ add.f32 %f561, %f194, %f545;
1332
+ add.f32 %f562, %f195, %f546;
1333
+ add.f32 %f563, %f196, %f547;
1334
+ .loc 1 73 24
1335
+ sub.f32 %f564, %f548, %f161;
1336
+ sub.f32 %f565, %f549, %f161;
1337
+ sub.f32 %f566, %f550, %f161;
1338
+ sub.f32 %f567, %f551, %f161;
1339
+ sub.f32 %f568, %f552, %f161;
1340
+ sub.f32 %f569, %f553, %f161;
1341
+ sub.f32 %f570, %f554, %f161;
1342
+ sub.f32 %f571, %f555, %f161;
1343
+ sub.f32 %f572, %f556, %f162;
1344
+ sub.f32 %f573, %f557, %f162;
1345
+ sub.f32 %f574, %f558, %f162;
1346
+ sub.f32 %f575, %f559, %f162;
1347
+ sub.f32 %f576, %f560, %f162;
1348
+ sub.f32 %f577, %f561, %f162;
1349
+ sub.f32 %f578, %f562, %f162;
1350
+ sub.f32 %f579, %f563, %f162;
1351
+ .loc 1 79 24
1352
+ mul.f32 %f582, %f564, %f580;
1353
+ mul.f32 %f583, %f565, %f580;
1354
+ mul.f32 %f584, %f566, %f580;
1355
+ mul.f32 %f585, %f567, %f580;
1356
+ mul.f32 %f586, %f568, %f580;
1357
+ mul.f32 %f587, %f569, %f580;
1358
+ mul.f32 %f588, %f570, %f580;
1359
+ mul.f32 %f589, %f571, %f580;
1360
+ mul.f32 %f590, %f572, %f581;
1361
+ mul.f32 %f591, %f573, %f581;
1362
+ mul.f32 %f592, %f574, %f581;
1363
+ mul.f32 %f593, %f575, %f581;
1364
+ mul.f32 %f594, %f576, %f581;
1365
+ mul.f32 %f595, %f577, %f581;
1366
+ mul.f32 %f596, %f578, %f581;
1367
+ mul.f32 %f597, %f579, %f581;
1368
+ .loc 1 80 24
1369
+ mul.f32 %f598, %f582, %f197;
1370
+ mul.f32 %f599, %f583, %f198;
1371
+ mul.f32 %f600, %f584, %f199;
1372
+ mul.f32 %f601, %f585, %f200;
1373
+ mul.f32 %f602, %f586, %f201;
1374
+ mul.f32 %f603, %f587, %f202;
1375
+ mul.f32 %f604, %f588, %f203;
1376
+ mul.f32 %f605, %f589, %f204;
1377
+ mul.f32 %f606, %f590, %f197;
1378
+ mul.f32 %f607, %f591, %f198;
1379
+ mul.f32 %f608, %f592, %f199;
1380
+ mul.f32 %f609, %f593, %f200;
1381
+ mul.f32 %f610, %f594, %f201;
1382
+ mul.f32 %f611, %f595, %f202;
1383
+ mul.f32 %f612, %f596, %f203;
1384
+ mul.f32 %f613, %f597, %f204;
1385
+ .loc 1 82 29
1386
+ shl.b64 %rd113, %rd11, 1;
1387
+ add.s64 %rd111, %rd16, %rd113;
1388
+ shl.b64 %rd114, %rd12, 1;
1389
+ add.s64 %rd112, %rd16, %rd114;
1390
+ .loc 1 82 52
1391
+ mov.b32 %r439, %f598;
1392
+ cvt.rn.bf16.f32 %rs33, %r439;
1393
+ mov.b32 %r440, %f599;
1394
+ cvt.rn.bf16.f32 %rs34, %r440;
1395
+ mov.b32 %r441, %f600;
1396
+ cvt.rn.bf16.f32 %rs35, %r441;
1397
+ mov.b32 %r442, %f601;
1398
+ cvt.rn.bf16.f32 %rs36, %r442;
1399
+ mov.b32 %r443, %f602;
1400
+ cvt.rn.bf16.f32 %rs37, %r443;
1401
+ mov.b32 %r444, %f603;
1402
+ cvt.rn.bf16.f32 %rs38, %r444;
1403
+ mov.b32 %r445, %f604;
1404
+ cvt.rn.bf16.f32 %rs39, %r445;
1405
+ mov.b32 %r446, %f605;
1406
+ cvt.rn.bf16.f32 %rs40, %r446;
1407
+ mov.b32 %r447, %f606;
1408
+ cvt.rn.bf16.f32 %rs41, %r447;
1409
+ mov.b32 %r448, %f607;
1410
+ cvt.rn.bf16.f32 %rs42, %r448;
1411
+ mov.b32 %r449, %f608;
1412
+ cvt.rn.bf16.f32 %rs43, %r449;
1413
+ mov.b32 %r450, %f609;
1414
+ cvt.rn.bf16.f32 %rs44, %r450;
1415
+ mov.b32 %r451, %f610;
1416
+ cvt.rn.bf16.f32 %rs45, %r451;
1417
+ mov.b32 %r452, %f611;
1418
+ cvt.rn.bf16.f32 %rs46, %r452;
1419
+ mov.b32 %r453, %f612;
1420
+ cvt.rn.bf16.f32 %rs47, %r453;
1421
+ mov.b32 %r454, %f613;
1422
+ cvt.rn.bf16.f32 %rs48, %r454;
1423
+ mov.b32 %r463, {%rs33, %rs34};
1424
+ mov.b32 %r464, {%rs35, %rs36};
1425
+ mov.b32 %r465, {%rs37, %rs38};
1426
+ mov.b32 %r466, {%rs39, %rs40};
1427
+ @%p1 st.global.v4.b32 [ %rd111 + 0 ], { %r463, %r464, %r465, %r466 };
1428
+ mov.b32 %r467, {%rs41, %rs42};
1429
+ mov.b32 %r468, {%rs43, %rs44};
1430
+ mov.b32 %r469, {%rs45, %rs46};
1431
+ mov.b32 %r470, {%rs47, %rs48};
1432
+ @%p1 st.global.v4.b32 [ %rd112 + 0 ], { %r467, %r468, %r469, %r470 };
1433
+ .loc 1 58 36
1434
+ add.s64 %rd117, %rd117, 256;
1435
+ add.s32 %r473, %r473, 64;
1436
+ setp.lt.u32 %p156, %r473, 192;
1437
+ @%p156 bra $L__BB0_5;
1438
+ bra.uni $L__BB0_8;
1439
+ $L__BB0_5:
1440
+ .loc 1 62 35
1441
+ add.s64 %rd90, %rd6, %rd117;
1442
+ add.s64 %rd91, %rd90, 16;
1443
+ add.s64 %rd92, %rd5, %rd117;
1444
+ .loc 1 62 51
1445
+ add.s64 %rd93, %rd92, 16;
1446
+ mov.u32 %r323, 0x0;
1447
+ mov.u32 %r324, 0x0;
1448
+ mov.u32 %r325, 0x0;
1449
+ mov.u32 %r326, 0x0;
1450
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r323, %r324, %r325, %r326 }, [ %rd90 + 0 ];
1451
+ @!%p1 mov.u32 %r323, %r411;
1452
+ @!%p1 mov.u32 %r324, %r411;
1453
+ @!%p1 mov.u32 %r325, %r411;
1454
+ @!%p1 mov.u32 %r326, %r411;
1455
+ mov.b32 %f165, %r323;
1456
+ mov.b32 %f166, %r324;
1457
+ mov.b32 %f167, %r325;
1458
+ mov.b32 %f168, %r326;
1459
+ mov.u32 %r331, 0x0;
1460
+ mov.u32 %r332, 0x0;
1461
+ mov.u32 %r333, 0x0;
1462
+ mov.u32 %r334, 0x0;
1463
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r331, %r332, %r333, %r334 }, [ %rd91 + 0 ];
1464
+ @!%p1 mov.u32 %r331, %r411;
1465
+ @!%p1 mov.u32 %r332, %r411;
1466
+ @!%p1 mov.u32 %r333, %r411;
1467
+ @!%p1 mov.u32 %r334, %r411;
1468
+ mov.b32 %f169, %r331;
1469
+ mov.b32 %f170, %r332;
1470
+ mov.b32 %f171, %r333;
1471
+ mov.b32 %f172, %r334;
1472
+ mov.u32 %r339, 0x0;
1473
+ mov.u32 %r340, 0x0;
1474
+ mov.u32 %r341, 0x0;
1475
+ mov.u32 %r342, 0x0;
1476
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r339, %r340, %r341, %r342 }, [ %rd92 + 0 ];
1477
+ @!%p1 mov.u32 %r339, %r411;
1478
+ @!%p1 mov.u32 %r340, %r411;
1479
+ @!%p1 mov.u32 %r341, %r411;
1480
+ @!%p1 mov.u32 %r342, %r411;
1481
+ mov.b32 %f173, %r339;
1482
+ mov.b32 %f174, %r340;
1483
+ mov.b32 %f175, %r341;
1484
+ mov.b32 %f176, %r342;
1485
+ mov.u32 %r347, 0x0;
1486
+ mov.u32 %r348, 0x0;
1487
+ mov.u32 %r349, 0x0;
1488
+ mov.u32 %r350, 0x0;
1489
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r347, %r348, %r349, %r350 }, [ %rd93 + 0 ];
1490
+ @!%p1 mov.u32 %r347, %r411;
1491
+ @!%p1 mov.u32 %r348, %r411;
1492
+ @!%p1 mov.u32 %r349, %r411;
1493
+ @!%p1 mov.u32 %r350, %r411;
1494
+ mov.b32 %f177, %r347;
1495
+ mov.b32 %f178, %r348;
1496
+ mov.b32 %f179, %r349;
1497
+ mov.b32 %f180, %r350;
1498
+ .loc 1 63 41
1499
+ add.s32 %r403, %r5, %r473;
1500
+ add.s32 %r404, %r403, 64;
1501
+ .loc 1 63 35
1502
+ add.s32 %r405, %r403, 8256;
1503
+ cvt.s64.s32 %rd11, %r404;
1504
+ mul.wide.s32 %rd98, %r404, 2;
1505
+ add.s64 %rd94, %rd14, %rd98;
1506
+ cvt.s64.s32 %rd12, %r405;
1507
+ mul.wide.s32 %rd99, %r405, 2;
1508
+ add.s64 %rd95, %rd14, %rd99;
1509
+ .loc 1 63 51
1510
+ mov.u32 %r355, 0x0;
1511
+ mov.u32 %r356, 0x0;
1512
+ mov.u32 %r357, 0x0;
1513
+ mov.u32 %r358, 0x0;
1514
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r355, %r356, %r357, %r358 }, [ %rd94 + 0 ];
1515
+ @!%p1 mov.u32 %r355, %r411;
1516
+ @!%p1 mov.u32 %r356, %r411;
1517
+ @!%p1 mov.u32 %r357, %r411;
1518
+ @!%p1 mov.u32 %r358, %r411;
1519
+ cvt.u16.u32 %rs17, %r355;
1520
+ { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r355; }
1521
+ cvt.u16.u32 %rs19, %r356;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r356; }
+ cvt.u16.u32 %rs21, %r357;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r357; }
+ cvt.u16.u32 %rs23, %r358;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r358; }
+ mov.u32 %r363, 0x0;
+ mov.u32 %r364, 0x0;
+ mov.u32 %r365, 0x0;
+ mov.u32 %r366, 0x0;
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r363, %r364, %r365, %r366 }, [ %rd95 + 0 ];
+ @!%p1 mov.u32 %r363, %r411;
+ @!%p1 mov.u32 %r364, %r411;
+ @!%p1 mov.u32 %r365, %r411;
+ @!%p1 mov.u32 %r366, %r411;
+ cvt.u16.u32 %rs25, %r363;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r363; }
+ cvt.u16.u32 %rs27, %r364;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r364; }
+ cvt.u16.u32 %rs29, %r365;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r365; }
+ cvt.u16.u32 %rs31, %r366;
+ { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r366; }
+ .loc 1 63 103
+ cvt.f32.bf16 %r371, %rs17;
+ mov.b32 %f181, %r371;
+ cvt.f32.bf16 %r372, %rs18;
+ mov.b32 %f182, %r372;
+ cvt.f32.bf16 %r373, %rs19;
+ mov.b32 %f183, %r373;
+ cvt.f32.bf16 %r374, %rs20;
+ mov.b32 %f184, %r374;
+ cvt.f32.bf16 %r375, %rs21;
+ mov.b32 %f185, %r375;
+ cvt.f32.bf16 %r376, %rs22;
+ mov.b32 %f186, %r376;
+ cvt.f32.bf16 %r377, %rs23;
+ mov.b32 %f187, %r377;
+ cvt.f32.bf16 %r378, %rs24;
+ mov.b32 %f188, %r378;
+ cvt.f32.bf16 %r379, %rs25;
+ mov.b32 %f189, %r379;
+ cvt.f32.bf16 %r380, %rs26;
+ mov.b32 %f190, %r380;
+ cvt.f32.bf16 %r381, %rs27;
+ mov.b32 %f191, %r381;
+ cvt.f32.bf16 %r382, %rs28;
+ mov.b32 %f192, %r382;
+ cvt.f32.bf16 %r383, %rs29;
+ mov.b32 %f193, %r383;
+ cvt.f32.bf16 %r384, %rs30;
+ mov.b32 %f194, %r384;
+ cvt.f32.bf16 %r385, %rs31;
+ mov.b32 %f195, %r385;
+ cvt.f32.bf16 %r386, %rs32;
+ mov.b32 %f196, %r386;
+ .loc 1 64 35
+ add.s64 %rd96, %rd9, %rd117;
+ .loc 1 64 40
+ add.s64 %rd97, %rd96, 16;
+ mov.u32 %r387, 0x0;
+ mov.u32 %r388, 0x0;
+ mov.u32 %r389, 0x0;
+ mov.u32 %r390, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r387, %r388, %r389, %r390 }, [ %rd96 + 0 ];
+ @!%p1 mov.u32 %r387, %r411;
+ @!%p1 mov.u32 %r388, %r411;
+ @!%p1 mov.u32 %r389, %r411;
+ @!%p1 mov.u32 %r390, %r411;
+ mov.b32 %f197, %r387;
+ mov.b32 %f198, %r388;
+ mov.b32 %f199, %r389;
+ mov.b32 %f200, %r390;
+ mov.u32 %r395, 0x0;
+ mov.u32 %r396, 0x0;
+ mov.u32 %r397, 0x0;
+ mov.u32 %r398, 0x0;
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r395, %r396, %r397, %r398 }, [ %rd97 + 0 ];
+ @!%p1 mov.u32 %r395, %r411;
+ @!%p1 mov.u32 %r396, %r411;
+ @!%p1 mov.u32 %r397, %r411;
+ @!%p1 mov.u32 %r398, %r411;
+ mov.b32 %f201, %r395;
+ mov.b32 %f202, %r396;
+ mov.b32 %f203, %r397;
+ mov.b32 %f204, %r398;
+ .loc 1 68 57
+ @%p51 bra $L__BB0_7;
+ mov.u64 %rd100, assertMessage_1;
+ cvta.global.u64 %rd101, %rd100;
+ mov.u64 %rd102, assertFile_1;
+ cvta.global.u64 %rd103, %rd102;
+ mov.u64 %rd104, assertFunc_1;
+ cvta.global.u64 %rd105, %rd104;
+ { // callseq 7, 0
+ .reg .b32 temp_param_reg;
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd101;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd103;
+ .param .b32 param2;
+ st.param.b32 [param2+0], %r471;
+ .param .b64 param3;
+ st.param.b64 [param3+0], %rd105;
+ .param .b64 param4;
+ st.param.b64 [param4+0], %rd115;
+ call.uni
+ __assertfail,
+ (
+ param0,
+ param1,
+ param2,
+ param3,
+ param4
+ );
+ } // callseq 7
+ bra.uni $L__BB0_7;
+ $L__BB0_8:
+ .loc 1 58 4
+ ret;
+ $L__tmp17:
+ $L__func_end0:
+
+ }
+ // .globl __nv_rsqrtf
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
+ .param .b32 __nv_rsqrtf_param_0
+ )
+ {
+ .reg .f32 %f<3>;
+ $L__func_begin1:
+
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
+ rsqrt.approx.ftz.f32 %f2, %f1;
+ st.param.f32 [func_retval0+0], %f2;
+ ret;
+ $L__func_end1:
+
+ }
+ .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+ .section .debug_abbrev
+ {
+ .b8 1
+ .b8 17
+ .b8 1
+ .b8 37
+ .b8 8
+ .b8 19
+ .b8 5
+ .b8 3
+ .b8 8
+ .b8 16
+ .b8 6
+ .b8 27
+ .b8 8
+ .b8 180
+ .b8 66
+ .b8 12
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 0
+ .b8 0
+ .b8 2
+ .b8 46
+ .b8 0
+ .b8 135
+ .b8 64
+ .b8 8
+ .b8 3
+ .b8 8
+ .b8 58
+ .b8 11
+ .b8 59
+ .b8 11
+ .b8 63
+ .b8 12
+ .b8 32
+ .b8 11
+ .b8 0
+ .b8 0
+ .b8 3
+ .b8 46
+ .b8 1
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 64
+ .b8 10
+ .b8 49
+ .b8 19
+ .b8 0
+ .b8 0
+ .b8 4
+ .b8 29
+ .b8 0
+ .b8 49
+ .b8 19
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 88
+ .b8 11
+ .b8 89
+ .b8 11
+ .b8 87
+ .b8 11
+ .b8 0
+ .b8 0
+ .b8 5
+ .b8 29
+ .b8 1
+ .b8 49
+ .b8 19
+ .b8 17
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 88
+ .b8 11
+ .b8 89
+ .b8 11
+ .b8 87
+ .b8 11
+ .b8 0
+ .b8 0
+ .b8 0
+ }
+ .section .debug_info
+ {
+ .b32 302
+ .b8 2
+ .b8 0
+ .b32 .debug_abbrev
+ .b8 8
+ .b8 1
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 0
+ .b8 2
+ .b8 0
+ .b8 99
+ .b8 99
+ .b8 105
+ .b8 103
+ .b8 54
+ .b8 102
+ .b8 107
+ .b8 105
+ .b8 54
+ .b8 112
+ .b8 52
+ .b8 108
+ .b8 120
+ .b8 114
+ .b8 100
+ .b8 109
+ .b8 103
+ .b8 103
+ .b8 54
+ .b8 101
+ .b8 117
+ .b8 100
+ .b8 97
+ .b8 104
+ .b8 105
+ .b8 101
+ .b8 120
+ .b8 99
+ .b8 118
+ .b8 117
+ .b8 101
+ .b8 101
+ .b8 111
+ .b8 108
+ .b8 50
+ .b8 112
+ .b8 52
+ .b8 113
+ .b8 112
+ .b8 53
+ .b8 51
+ .b8 50
+ .b8 112
+ .b8 118
+ .b8 118
+ .b8 101
+ .b8 50
+ .b8 121
+ .b8 52
+ .b8 54
+ .b8 51
+ .b8 121
+ .b8 46
+ .b8 112
+ .b8 121
+ .b8 0
+ .b32 .debug_line
+ .b8 47
+ .b8 116
+ .b8 109
+ .b8 112
+ .b8 47
+ .b8 116
+ .b8 111
+ .b8 114
+ .b8 99
+ .b8 104
+ .b8 105
+ .b8 110
+ .b8 100
+ .b8 117
+ .b8 99
+ .b8 116
+ .b8 111
+ .b8 114
+ .b8 95
+ .b8 114
+ .b8 111
+ .b8 111
+ .b8 116
+ .b8 47
+ .b8 99
+ .b8 105
+ .b8 0
+ .b8 1
+ .b64 $L__func_begin0
+ .b64 $L__func_end0
+ .b8 2
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 95
+ .b8 95
+ .b8 48
+ .b8 100
+ .b8 49
+ .b8 100
+ .b8 50
+ .b8 100
+ .b8 51
+ .b8 100
+ .b8 52
+ .b8 100
+ .b8 53
+ .b8 100
+ .b8 54
+ .b8 100
+ .b8 101
+ .b8 55
+ .b8 100
+ .b8 101
+ .b8 0
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 95
+ .b8 95
+ .b8 48
+ .b8 100
+ .b8 49
+ .b8 100
+ .b8 50
+ .b8 100
+ .b8 51
+ .b8 100
+ .b8 52
+ .b8 100
+ .b8 53
+ .b8 100
+ .b8 54
+ .b8 100
+ .b8 101
+ .b8 55
+ .b8 100
+ .b8 101
+ .b8 0
+ .b8 1
+ .b8 18
+ .b8 1
+ .b8 1
+ .b8 3
+ .b64 $L__func_begin0
+ .b64 $L__func_end0
+ .b8 1
+ .b8 156
+ .b32 125
+ .b8 4
+ .b32 125
+ .b64 $L__tmp1
+ .b64 $L__tmp2
+ .b8 2
+ .b8 47
+ .b8 41
+ .b8 5
+ .b32 125
+ .b64 $L__tmp3
+ .b64 $L__tmp16
+ .b8 2
+ .b8 53
+ .b8 44
+ .b8 4
+ .b32 125
+ .b64 $L__tmp3
+ .b64 $L__tmp16
+ .b8 2
+ .b8 120
+ .b8 46
+ .b8 0
+ .b8 4
+ .b32 125
+ .b64 $L__tmp4
+ .b64 $L__tmp15
+ .b8 2
+ .b8 53
+ .b8 44
+ .b8 0
+ .b8 0
+ }
+ .section .debug_pubnames
+ {
+ .b32 $L__pubNames_end0-$L__pubNames_start0
+ $L__pubNames_start0:
+ .b8 2
+ .b8 0
+ .b32 .debug_info
+ .b32 306
+ .b32 125
+ .b8 116
+ .b8 114
+ .b8 105
+ .b8 116
+ .b8 111
+ .b8 110
+ .b8 95
+ .b8 95
+ .b8 48
+ .b8 100
+ .b8 49
+ .b8 100
+ .b8 50
+ .b8 100
+ .b8 51
+ .b8 100
+ .b8 52
+ .b8 100
+ .b8 53
+ .b8 100
+ .b8 54
+ .b8 100
+ .b8 101
+ .b8 55
+ .b8 100
+ .b8 101
+ .b8 0
+ .b32 0
+ $L__pubNames_end0:
+ }
+ .section .debug_pubtypes
+ {
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
+ $L__pubTypes_start0:
+ .b8 2
+ .b8 0
+ .b32 .debug_info
+ .b32 306
+ .b32 0
+ $L__pubTypes_end0:
+ }
+ .section .debug_loc { }
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir ADDED
@@ -0,0 +1,164 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
+ %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
+ %c0_i32 = arith.constant 0 : i32
+ %c64_i32 = arith.constant 64 : i32
+ %c256_i32 = arith.constant 256 : i32
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
+ %cst_11 = arith.constant 0.000000e+00 : f32
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
+ %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
+ %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked>
+ %0 = tt.get_program_id x : i32
+ %1 = arith.muli %0, %c64_i32 : i32
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
+ %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+ %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+ %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
+ %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
+ %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
+ %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
+ %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
+ %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
+ %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
+ %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
+ %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
+ %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
+ %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>
+ %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>
+ %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+ %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+ %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>) : i32 {
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked2>
+ %51 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
+ %52 = arith.addi %50, %13 : tensor<1x64xi32, #blocked2>
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x64xi32, #blocked>
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x64xi32, #blocked2>
+ %55 = tt.broadcast %51 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %56 = arith.addi %55, %22 : tensor<64x64xi32, #blocked>
+ %57 = tt.addptr %23, %56 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+ %59 = tt.broadcast %54 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
+ %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %61 = arith.addi %55, %25 : tensor<64x64xi32, #blocked>
+ %62 = tt.addptr %26, %61 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>
+ %64 = arith.extf %63 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
+ tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
+ %65 = arith.extsi %51 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+ %66 = tt.broadcast %65 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+ %67 = arith.addi %66, %37 : tensor<64x64xi64, #blocked>
+ %68 = tt.addptr %38, %67 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+ %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %70 = arith.addf %69, %60 : tensor<64x64xf32, #blocked>
+ %71 = arith.addf %70, %64 : tensor<64x64xf32, #blocked>
+ %72 = arith.subf %71, %arg9 : tensor<64x64xf32, #blocked>
+ %73 = arith.addf %arg12, %cst_4 : tensor<64x64xf32, #blocked>
+ %74 = arith.addf %arg11, %cst_10 : tensor<64x64xf32, #blocked2>
+ %75 = arith.divf %72, %73 : tensor<64x64xf32, #blocked>
+ %76 = arith.addf %arg9, %75 : tensor<64x64xf32, #blocked>
+ %77 = arith.subf %71, %76 : tensor<64x64xf32, #blocked>
+ %78 = arith.mulf %72, %77 : tensor<64x64xf32, #blocked>
+ %79 = arith.addf %arg10, %78 : tensor<64x64xf32, #blocked>
+ %80 = arith.select %58, %76, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+ %81 = arith.select %58, %79, %arg10 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+ %82 = arith.select %58, %73, %arg12 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+ %83 = arith.select %59, %74, %arg11 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
+ scf.yield %80, %81, %83, %82 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
+ }
+ %40 = triton_gpu.convert_layout %39#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>
+ %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+ %49 = arith.subf %arg11, %arg8 : f32
+ %50 = arith.addf %arg10, %arg13 : f32
+ %51 = arith.cmpf oeq, %50, %cst_11 : f32
+ %52 = arith.divf %arg13, %50 : f32
+ %53 = arith.select %51, %cst_11, %52 : f32
+ %54 = arith.mulf %49, %53 : f32
+ %55 = arith.addf %arg8, %54 : f32
+ %56 = arith.addf %arg9, %arg12 : f32
+ %57 = arith.mulf %49, %49 : f32
+ %58 = arith.mulf %57, %arg10 : f32
+ %59 = arith.mulf %58, %53 : f32
+ %60 = arith.addf %56, %59 : f32
+ tt.reduce.return %55, %60, %50 : f32, f32, f32
+ }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+ %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+ %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+ %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
+ %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+ %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>
+ %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>
+ %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
+ %50 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
+ %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>
+ %52 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+ %53 = arith.addi %52, %22 : tensor<64x64xi32, #blocked>
+ %54 = tt.addptr %23, %53 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %55 = tt.broadcast %51 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+ %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %57 = arith.addi %52, %25 : tensor<64x64xi32, #blocked>
+ %58 = tt.addptr %26, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>
+ %60 = arith.extf %59 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
+ %61 = tt.addptr %44, %50 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
+ %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>
+ tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
+ %63 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+ %64 = tt.broadcast %63 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+ %65 = arith.addi %64, %37 : tensor<64x64xi64, #blocked>
+ %66 = tt.addptr %38, %65 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+ %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
+ %68 = arith.addf %67, %56 : tensor<64x64xf32, #blocked>
+ %69 = arith.addf %68, %60 : tensor<64x64xf32, #blocked>
+ %70 = arith.subf %69, %45 : tensor<64x64xf32, #blocked>
+ %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
+ %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+ %73 = arith.mulf %70, %72 : tensor<64x64xf32, #blocked>
+ %74 = tt.broadcast %62 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+ %75 = arith.mulf %73, %74 : tensor<64x64xf32, #blocked>
+ %76 = tt.addptr %48, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+ %77 = arith.truncf %75 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
+ tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>
+ }
+ tt.return
+ }
+ }
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir ADDED
@@ -0,0 +1,151 @@
+ module {
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
+ %c256_i32 = arith.constant 256 : i32
+ %c64_i32 = arith.constant 64 : i32
+ %c0_i32 = arith.constant 0 : i32
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
+ %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
+ %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
+ %cst_10 = arith.constant dense<256> : tensor<1x64xi32>
+ %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
+ %0 = tt.get_program_id x : i32
+ %1 = arith.muli %0, %c64_i32 : i32
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
+ %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
+ %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
+ %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
+ %10 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
+ %11 = arith.muli %10, %cst_9 : tensor<64x1xi32>
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
+ %14 = arith.muli %5, %cst_9 : tensor<64x1xi32>
+ %15 = tt.broadcast %14 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+ %16 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
+ %17 = arith.addi %9, %cst_4 : tensor<64x1xi64>
+ %18 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64>
+ %19 = arith.select %18, %17, %9 : tensor<64x1xi1>, tensor<64x1xi64>
+ %20 = arith.cmpi sge, %19, %cst_3 : tensor<64x1xi64>
+ %21 = arith.cmpi slt, %19, %cst_4 : tensor<64x1xi64>
+ %22 = arith.andi %20, %21 : tensor<64x1xi1>
+ %23 = arith.muli %19, %cst_2 : tensor<64x1xi64>
+ %24 = tt.broadcast %23 : (tensor<64x1xi64>) -> tensor<64x64xi64>
+ %25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
+ %26:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32>
+ %51 = arith.addi %50, %6 : tensor<1x64xi32>
+ %52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32>
+ %53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32>
+ %54 = arith.addi %53, %12 : tensor<64x64xi32>
+ %55 = tt.addptr %13, %54 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
+ %56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1>
+ %57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
+ %58 = arith.addi %53, %15 : tensor<64x64xi32>
+ %59 = tt.addptr %16, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
+ %60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16>
+ %61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32>
+ tt.assert %22, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+ %62 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64>
+ %63 = tt.broadcast %62 : (tensor<1x64xi64>) -> tensor<64x64xi64>
+ %64 = arith.addi %63, %24 : tensor<64x64xi64>
+ %65 = tt.addptr %25, %64 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
+ %66 = tt.load %65, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
+ %67 = arith.addf %66, %57 : tensor<64x64xf32>
+ %68 = arith.addf %67, %61 : tensor<64x64xf32>
+ %69 = arith.subf %68, %arg9 : tensor<64x64xf32>
+ %70 = arith.addf %arg11, %cst_1 : tensor<64x64xf32>
+ %71 = arith.divf %69, %70 : tensor<64x64xf32>
+ %72 = arith.addf %arg9, %71 : tensor<64x64xf32>
+ %73 = arith.subf %68, %72 : tensor<64x64xf32>
+ %74 = arith.mulf %69, %73 : tensor<64x64xf32>
+ %75 = arith.addf %arg10, %74 : tensor<64x64xf32>
+ %76 = arith.select %56, %72, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
+ %77 = arith.select %56, %75, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
+ %78 = arith.select %56, %70, %arg11 : tensor<64x64xi1>, tensor<64x64xf32>
+ scf.yield %76, %77, %78 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
+ }
+ %27:3 = "tt.reduce"(%26#0, %26#1, %26#2) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+ %50 = arith.subf %arg11, %arg8 : f32
+ %51 = arith.addf %arg10, %arg13 : f32
+ %52 = arith.cmpf oeq, %51, %cst_0 : f32
+ %53 = arith.divf %arg13, %51 : f32
+ %54 = arith.select %52, %cst_0, %53 : f32
+ %55 = arith.mulf %50, %54 : f32
+ %56 = arith.addf %arg8, %55 : f32
+ %57 = arith.addf %arg9, %arg12 : f32
+ %58 = arith.mulf %50, %50 : f32
+ %59 = arith.mulf %58, %arg10 : f32
+ %60 = arith.mulf %59, %54 : f32
+ %61 = arith.addf %57, %60 : f32
+ tt.reduce.return %56, %61, %51 : f32, f32, f32
+ }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
+ %28 = tt.expand_dims %27#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+ %29 = tt.expand_dims %27#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+ %30 = arith.muli %10, %cst_9 : tensor<64x1xi32>
+ %31 = tt.broadcast %30 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
+ %33 = arith.muli %5, %cst_9 : tensor<64x1xi32>
+ %34 = tt.broadcast %33 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+ %35 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
+ %36 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>>
+ %37 = arith.addi %9, %cst_4 : tensor<64x1xi64>
+ %38 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64>
+ %39 = arith.select %38, %37, %9 : tensor<64x1xi1>, tensor<64x1xi64>
+ %40 = arith.cmpi sge, %39, %cst_3 : tensor<64x1xi64>
+ %41 = arith.cmpi slt, %39, %cst_4 : tensor<64x1xi64>
+ %42 = arith.andi %40, %41 : tensor<64x1xi1>
+ %43 = arith.muli %39, %cst_2 : tensor<64x1xi64>
+ %44 = tt.broadcast %43 : (tensor<64x1xi64>) -> tensor<64x64xi64>
+ %45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
+ %46 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x64xf32>
+ %47 = arith.divf %29, %cst_6 : tensor<64x1xf32>
+ %48 = arith.addf %47, %cst_5 : tensor<64x1xf32>
+ %49 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32>
+ %51 = arith.addi %50, %6 : tensor<1x64xi32>
+ %52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32>
+ %53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32>
+ %54 = arith.addi %53, %31 : tensor<64x64xi32>
+ %55 = tt.addptr %32, %54 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
+ %56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1>
+ %57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
+ %58 = arith.addi %53, %34 : tensor<64x64xi32>
+ %59 = tt.addptr %35, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
+ %60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
+ %61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32>
+ %62 = tt.addptr %36, %51 : tensor<1x64x!tt.ptr<f32, 1>>, tensor<1x64xi32>
+ %63 = tt.load %62, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
+ tt.assert %42, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+ %64 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64>
+ %65 = tt.broadcast %64 : (tensor<1x64xi64>) -> tensor<64x64xi64>
+ %66 = arith.addi %65, %44 : tensor<64x64xi64>
+ %67 = tt.addptr %45, %66 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
+ %68 = tt.load %67, %56, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
+ %69 = arith.addf %68, %57 : tensor<64x64xf32>
+ %70 = arith.addf %69, %61 : tensor<64x64xf32>
+ %71 = arith.subf %70, %46 : tensor<64x64xf32>
+ %72 = tt.extern_elementwise %48 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
+ %73 = tt.broadcast %72 : (tensor<64x1xf32>) -> tensor<64x64xf32>
+ %74 = arith.mulf %71, %73 : tensor<64x64xf32>
+ %75 = tt.broadcast %63 : (tensor<1x64xf32>) -> tensor<64x64xf32>
+ %76 = arith.mulf %74, %75 : tensor<64x64xf32>
+ %77 = tt.addptr %49, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
+ %78 = arith.truncf %76 : tensor<64x64xf32> to tensor<64x64xbf16>
+ tt.store %77, %78, %56 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
+ }
+ tt.return
+ }
+ }
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir ADDED
@@ -0,0 +1,169 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
+ %cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked>
+ %cst_1 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked>
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked>
+ %cst_4 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
+ %cst_5 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
+ %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
+ %cst_7 = arith.constant dense<50257> : tensor<2x1xi64, #blocked1>
+ %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked1>
+ %c0_i32 = arith.constant 0 : i32
+ %c128_i32 = arith.constant 128 : i32
+ %c256_i32 = arith.constant 256 : i32
+ %cst_9 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked2>
+ %cst_10 = arith.constant 0.000000e+00 : f32
+ %cst_11 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked2>
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked2>
+ %cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2>
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
+ %cst_16 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked>
+ %c2_i32 = arith.constant 2 : i32
+ %0 = tt.get_program_id x : i32
+ %1 = arith.muli %0, %c2_i32 : i32
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+ %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<2x1xi32, #blocked1>
+ %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
+ %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked1>
+ %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
+ %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked1>
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+ %11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked>
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2>
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked1>
+ %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
+ %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked1>, tensor<2x1xi32, #blocked1>
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked1>
+ %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
+ %21 = arith.muli %20, %cst_1 : tensor<2x1xi32, #blocked>
+ %22 = tt.broadcast %21 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
+ %24 = arith.muli %8, %cst_1 : tensor<2x1xi32, #blocked>
+ %25 = tt.broadcast %24 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+ %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
+ %27 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
+ %28 = arith.addi %19, %cst_7 : tensor<2x1xi64, #blocked1>
+ %29 = arith.cmpi slt, %18, %cst_5 : tensor<2x1xi64, #blocked>
+ %30 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked1>
+ %31 = arith.select %29, %27, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
+ %32 = arith.select %30, %28, %19 : tensor<2x1xi1, #blocked1>, tensor<2x1xi64, #blocked1>
+ %33 = arith.cmpi sge, %32, %cst_8 : tensor<2x1xi64, #blocked1>
+ %34 = arith.cmpi slt, %32, %cst_7 : tensor<2x1xi64, #blocked1>
+ %35 = arith.andi %33, %34 : tensor<2x1xi1, #blocked1>
+ %36 = arith.muli %31, %cst_4 : tensor<2x1xi64, #blocked>
+ %37 = tt.broadcast %36 : (tensor<2x1xi64, #blocked>) -> tensor<2x128xi64, #blocked>
+ %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
+ %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>) : i32 {
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
+ %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
+ %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
+ %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+ %56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
+ %57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
+ %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
+ %59 = tt.broadcast %54 : (tensor<1x128xi1, #blocked2>) -> tensor<2x128xi1, #blocked2>
+ %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+ %61 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
+ %62 = tt.addptr %26, %61 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
+ %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
+ %64 = arith.extf %63 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
+ tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
+ %65 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
+ %66 = tt.broadcast %65 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
+ %67 = arith.addi %66, %37 : tensor<2x128xi64, #blocked>
+ %68 = tt.addptr %38, %67 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
+ %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+ %70 = arith.addf %69, %60 : tensor<2x128xf32, #blocked>
+ %71 = arith.addf %70, %64 : tensor<2x128xf32, #blocked>
+ %72 = arith.subf %71, %arg9 : tensor<2x128xf32, #blocked>
+ %73 = arith.addf %arg12, %cst_3 : tensor<2x128xf32, #blocked>
+ %74 = arith.addf %arg11, %cst_9 : tensor<2x128xf32, #blocked2>
+ %75 = arith.divf %72, %73 : tensor<2x128xf32, #blocked>
+ %76 = arith.addf %arg9, %75 : tensor<2x128xf32, #blocked>
+ %77 = arith.subf %71, %76 : tensor<2x128xf32, #blocked>
+ %78 = arith.mulf %72, %77 : tensor<2x128xf32, #blocked>
+ %79 = arith.addf %arg10, %78 : tensor<2x128xf32, #blocked>
+ %80 = arith.select %58, %76, %arg9 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
+ %81 = arith.select %58, %79, %arg10 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
+ %82 = arith.select %58, %73, %arg12 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
+ %83 = arith.select %59, %74, %arg11 : tensor<2x128xi1, #blocked2>, tensor<2x128xf32, #blocked2>
+ scf.yield %80, %81, %83, %82 : tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>
+ }
+ %40 = triton_gpu.convert_layout %39#2 : (tensor<2x128xf32, #blocked2>) -> tensor<2x128xf32, #blocked>
+ %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+ %49 = arith.subf %arg11, %arg8 : f32
+ %50 = arith.addf %arg10, %arg13 : f32
+ %51 = arith.cmpf oeq, %50, %cst_10 : f32
+ %52 = arith.divf %arg13, %50 : f32
+ %53 = arith.select %51, %cst_10, %52 : f32
+ %54 = arith.mulf %49, %53 : f32
+ %55 = arith.addf %arg8, %54 : f32
+ %56 = arith.addf %arg9, %arg12 : f32
+ %57 = arith.mulf %49, %49 : f32
+ %58 = arith.mulf %57, %arg10 : f32
+ %59 = arith.mulf %58, %53 : f32
+ %60 = arith.addf %56, %59 : f32
+ tt.reduce.return %55, %60, %50 : f32, f32, f32
+ }) : (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+ %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+ %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+ %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>, #blocked2>
+ %45 = tt.broadcast %42 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
+ %46 = arith.divf %43, %cst_15 : tensor<2x1xf32, #blocked>
+ %47 = arith.addf %46, %cst_14 : tensor<2x1xf32, #blocked>
+ %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
+ %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
+ %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
+ %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+ %56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
+ %57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
+ %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
+ %59 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+ %60 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
+ %61 = tt.addptr %26, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
+ %62 = tt.load %61, %58, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
+ %63 = arith.extf %62 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
+ %64 = tt.addptr %44, %52 : tensor<1x128x!tt.ptr<f32, 1>, #blocked2>, tensor<1x128xi32, #blocked2>
+ %65 = tt.load %64, %54, %cst_11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked2>
+ tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
+ %66 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
+ %67 = tt.broadcast %66 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
+ %68 = arith.addi %67, %37 : tensor<2x128xi64, #blocked>
+ %69 = tt.addptr %38, %68 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
+ %70 = tt.load %69, %58, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+ %71 = arith.addf %70, %59 : tensor<2x128xf32, #blocked>
+ %72 = arith.addf %71, %63 : tensor<2x128xf32, #blocked>
+ %73 = arith.subf %72, %45 : tensor<2x128xf32, #blocked>
+ %74 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
+ %75 = tt.broadcast %74 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
+ %76 = arith.mulf %73, %75 : tensor<2x128xf32, #blocked>
+ %77 = triton_gpu.convert_layout %65 : (tensor<1x128xf32, #blocked2>) -> tensor<1x128xf32, #blocked>
+ %78 = tt.broadcast %77 : (tensor<1x128xf32, #blocked>) -> tensor<2x128xf32, #blocked>
+ %79 = arith.mulf %76, %78 : tensor<2x128xf32, #blocked>
+ %80 = tt.addptr %48, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
+ %81 = arith.truncf %79 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked>
+ tt.store %80, %81, %58 {cache = 1 : i32, evict = 1 : i32} : tensor<2x128xbf16, #blocked>
+ }
+ tt.return
+ }
+ }
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir ADDED
@@ -0,0 +1,110 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d3d4d5d6e7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg6: i64 {tt.max_divisibility = 8 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<7680> : tensor<1x2048xi64, #blocked>
+ %cst_0 = arith.constant dense<7680> : tensor<1x2048xi64, #blocked1>
+ %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
+ %c385973760_i64 = arith.constant 385973760 : i64
+ %c7680_i64 = arith.constant 7680 : i64
+ %c8_i64 = arith.constant 8 : i64
+ %cst_2 = arith.constant dense<-1> : tensor<1x2048xi64, #blocked>
+ %cst_3 = arith.constant dense<0> : tensor<1x2048xi64, #blocked>
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked1>
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1>
+ %c0_i32 = arith.constant 0 : i32
+ %c7680_i32 = arith.constant 7680 : i32
+ %c2048_i32 = arith.constant 2048 : i32
+ %0 = tt.get_program_id x : i32
+ %1 = arith.extsi %0 : i32 to i64
+ %2 = arith.cmpi slt, %1, %c8_i64 : i64
+ %3 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+ %4 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
+ %5 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
+ %6 = tt.expand_dims %4 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x2048xi32, #blocked1>
+ %7 = arith.extsi %5 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
+ %8 = arith.extsi %6 : tensor<1x2048xi32, #blocked1> to tensor<1x2048xi64, #blocked1>
+ %9 = arith.muli %1, %c7680_i64 : i64
+ %10 = tt.splat %9 : (i64) -> tensor<1x2048xi64, #blocked>
+ %11 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<1x2048x!tt.ptr<i64, 1>, #blocked>
+ %12 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked>
+ %13 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked1>
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
+ %15 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
+ %16 = arith.muli %1, %c385973760_i64 : i64
+ %17 = tt.splat %16 : (i64) -> tensor<1x2048xi64, #blocked>
+ %18 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
+ %19:2 = scf.for %arg8 = %c0_i32 to %c7680_i32 step %c2048_i32 iter_args(%arg9 = %cst_4, %arg10 = %cst_3) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>) : i32 {
+ %30 = arith.extsi %arg8 : i32 to i64
+ %31 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked>
+ %32 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked1>
+ %33 = arith.addi %31, %7 : tensor<1x2048xi64, #blocked>
+ %34 = arith.addi %32, %8 : tensor<1x2048xi64, #blocked1>
+ %35 = arith.cmpi slt, %33, %cst : tensor<1x2048xi64, #blocked>
+ %36 = arith.cmpi slt, %34, %cst_0 : tensor<1x2048xi64, #blocked1>
+ %37 = arith.addi %33, %10 : tensor<1x2048xi64, #blocked>
+ %38 = tt.addptr %11, %37 : tensor<1x2048x!tt.ptr<i64, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+ %39 = arith.andi %35, %12 : tensor<1x2048xi1, #blocked>
+ %40 = arith.andi %36, %13 : tensor<1x2048xi1, #blocked1>
+ %41 = tt.load %38, %39, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xi64, #blocked>
+ %42 = tt.addptr %14, %37 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+ %43 = tt.load %42, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
+ %44 = triton_gpu.convert_layout %43 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1>
+ %45 = tt.addptr %15, %37 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+ %46 = tt.load %45, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
+ %47 = arith.cmpi ne, %41, %cst_2 : tensor<1x2048xi64, #blocked>
+ %48 = triton_gpu.convert_layout %47 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked1>
+ %49 = arith.select %47, %41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
+ %50 = arith.addi %49, %cst_1 : tensor<1x2048xi64, #blocked>
+ %51 = arith.cmpi slt, %49, %cst_3 : tensor<1x2048xi64, #blocked>
+ %52 = arith.select %51, %50, %49 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
+ %53 = arith.cmpi sge, %52, %cst_3 : tensor<1x2048xi64, #blocked>
+ %54 = arith.cmpi slt, %52, %cst_1 : tensor<1x2048xi64, #blocked>
+ %55 = arith.andi %53, %54 : tensor<1x2048xi1, #blocked>
+ %56 = triton_gpu.convert_layout %55 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked2>
+ tt.assert %56, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x2048xi1, #blocked2>
+ %57 = arith.muli %33, %cst_1 : tensor<1x2048xi64, #blocked>
+ %58 = arith.addi %52, %57 : tensor<1x2048xi64, #blocked>
+ %59 = arith.addi %58, %17 : tensor<1x2048xi64, #blocked>
+ %60 = tt.addptr %18, %59 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+ %61 = triton_gpu.convert_layout %60 : (tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked1>
+ %62 = tt.load %61, %40, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked1>
+ %63 = arith.extf %62 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1>
+ %64 = arith.subf %63, %44 : tensor<1x2048xf32, #blocked1>
+ %65 = math.log %46 : tensor<1x2048xf32, #blocked>
+ %66 = triton_gpu.convert_layout %65 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1>
+ %67 = arith.subf %64, %66 : tensor<1x2048xf32, #blocked1>
+ %68 = arith.subf %cst_4, %67 : tensor<1x2048xf32, #blocked1>
+ %69 = arith.select %48, %68, %cst_4 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1>
+ %70 = arith.addf %arg9, %69 : tensor<1x2048xf32, #blocked1>
+ %71 = arith.select %40, %70, %arg9 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1>
+ %72 = arith.extui %47 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked>
+ %73 = arith.addi %arg10, %72 : tensor<1x2048xi64, #blocked>
+ %74 = arith.select %39, %73, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
+ scf.yield %71, %74 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>
+ }
+ %20 = "tt.reduce"(%19#0) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32):
+ %30 = arith.addf %arg8, %arg9 : f32
+ tt.reduce.return %30 : f32
+ }) : (tensor<1x2048xf32, #blocked1>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+ %21 = tt.expand_dims %20 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
+ %22 = tt.addptr %arg4, %1 : !tt.ptr<f32, 1>, i64
+ %23 = tt.splat %22 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
+ %24 = tt.splat %2 : (i1) -> tensor<1x1xi1, #blocked1>
+ tt.store %23, %21, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
+ %25 = "tt.reduce"(%19#1) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: i64, %arg9: i64):
+ %30 = arith.addi %arg8, %arg9 : i64
+ tt.reduce.return %30 : i64
+ }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+ %26 = triton_gpu.convert_layout %25 : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+ %27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xi64, #blocked1>
+ %28 = tt.addptr %arg5, %1 : !tt.ptr<i64, 1>, i64
+ %29 = tt.splat %28 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked1>
+ tt.store %29, %27, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked1>
+ tt.return
+ }
+ }
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttir ADDED
@@ -0,0 +1,91 @@
1
+ module {
+ tt.func public @triton__0d1d2d3d4d5d6e7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg6: i64 {tt.max_divisibility = 8 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %c8_i64 = arith.constant 8 : i64
+ %c7680_i64 = arith.constant 7680 : i64
+ %c385973760_i64 = arith.constant 385973760 : i64
+ %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>
+ %cst_0 = arith.constant dense<50257> : tensor<1x2048xi64>
+ %cst_1 = arith.constant dense<7680> : tensor<1x2048xi64>
+ %c2048_i32 = arith.constant 2048 : i32
+ %c7680_i32 = arith.constant 7680 : i32
+ %c0_i32 = arith.constant 0 : i32
+ %cst_2 = arith.constant dense<-1> : tensor<1x2048xi64>
+ %cst_3 = arith.constant dense<0> : tensor<1x2048xi64>
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>
+ %0 = tt.get_program_id x : i32
+ %1 = arith.extsi %0 : i32 to i64
+ %2 = arith.cmpi slt, %1, %c8_i64 : i64
+ %3 = tt.splat %2 : (i1) -> tensor<1x1xi1>
+ %4 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>
+ %5 = tt.expand_dims %4 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
+ %6 = arith.extsi %5 : tensor<1x2048xi32> to tensor<1x2048xi64>
+ %7 = arith.muli %1, %c7680_i64 : i64
+ %8 = tt.splat %7 : (i64) -> tensor<1x2048xi64>
+ %9 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<1x2048x!tt.ptr<i64, 1>>
+ %10 = tt.splat %2 : (i1) -> tensor<1x2048xi1>
+ %11 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
+ %12 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
+ %13 = arith.muli %1, %c385973760_i64 : i64
+ %14 = tt.splat %13 : (i64) -> tensor<1x2048xi64>
+ %15 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
+ %16:2 = scf.for %arg8 = %c0_i32 to %c7680_i32 step %c2048_i32 iter_args(%arg9 = %cst_4, %arg10 = %cst_3) -> (tensor<1x2048xf32>, tensor<1x2048xi64>) : i32 {
+ %25 = arith.extsi %arg8 : i32 to i64
+ %26 = tt.splat %25 : (i64) -> tensor<1x2048xi64>
+ %27 = arith.addi %26, %6 : tensor<1x2048xi64>
+ %28 = arith.cmpi slt, %27, %cst_1 : tensor<1x2048xi64>
+ %29 = arith.addi %27, %8 : tensor<1x2048xi64>
+ %30 = tt.addptr %9, %29 : tensor<1x2048x!tt.ptr<i64, 1>>, tensor<1x2048xi64>
+ %31 = arith.andi %28, %10 : tensor<1x2048xi1>
+ %32 = tt.load %30, %31, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xi64>
+ %33 = tt.addptr %11, %29 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
+ %34 = tt.load %33, %31, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32>
+ %35 = tt.addptr %12, %29 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
+ %36 = tt.load %35, %31, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32>
+ %37 = arith.cmpi ne, %32, %cst_2 : tensor<1x2048xi64>
+ %38 = arith.select %37, %32, %cst_3 : tensor<1x2048xi1>, tensor<1x2048xi64>
+ %39 = arith.addi %38, %cst_0 : tensor<1x2048xi64>
+ %40 = arith.cmpi slt, %38, %cst_3 : tensor<1x2048xi64>
+ %41 = arith.select %40, %39, %38 : tensor<1x2048xi1>, tensor<1x2048xi64>
+ %42 = arith.cmpi sge, %41, %cst_3 : tensor<1x2048xi64>
+ %43 = arith.cmpi slt, %41, %cst_0 : tensor<1x2048xi64>
+ %44 = arith.andi %42, %43 : tensor<1x2048xi1>
+ tt.assert %44, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x2048xi1>
+ %45 = arith.muli %27, %cst_0 : tensor<1x2048xi64>
+ %46 = arith.addi %41, %45 : tensor<1x2048xi64>
+ %47 = arith.addi %46, %14 : tensor<1x2048xi64>
+ %48 = tt.addptr %15, %47 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
+ %49 = tt.load %48, %31, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16>
+ %50 = arith.extf %49 : tensor<1x2048xbf16> to tensor<1x2048xf32>
+ %51 = arith.subf %50, %34 : tensor<1x2048xf32>
+ %52 = math.log %36 : tensor<1x2048xf32>
+ %53 = arith.subf %51, %52 : tensor<1x2048xf32>
+ %54 = arith.subf %cst_4, %53 : tensor<1x2048xf32>
+ %55 = arith.select %37, %54, %cst_4 : tensor<1x2048xi1>, tensor<1x2048xf32>
+ %56 = arith.addf %arg9, %55 : tensor<1x2048xf32>
+ %57 = arith.select %31, %56, %arg9 : tensor<1x2048xi1>, tensor<1x2048xf32>
+ %58 = arith.extui %37 : tensor<1x2048xi1> to tensor<1x2048xi64>
+ %59 = arith.addi %arg10, %58 : tensor<1x2048xi64>
+ %60 = arith.select %31, %59, %arg10 : tensor<1x2048xi1>, tensor<1x2048xi64>
+ scf.yield %57, %60 : tensor<1x2048xf32>, tensor<1x2048xi64>
+ }
+ %17 = "tt.reduce"(%16#0) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32):
+ %25 = arith.addf %arg8, %arg9 : f32
+ tt.reduce.return %25 : f32
+ }) : (tensor<1x2048xf32>) -> tensor<1xf32>
+ %18 = tt.expand_dims %17 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
+ %19 = tt.addptr %arg4, %1 : !tt.ptr<f32, 1>, i64
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
+ tt.store %20, %18, %3 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
+ %21 = "tt.reduce"(%16#1) <{axis = 1 : i32}> ({
+ ^bb0(%arg8: i64, %arg9: i64):
+ %25 = arith.addi %arg8, %arg9 : i64
+ tt.reduce.return %25 : i64
+ }) : (tensor<1x2048xi64>) -> tensor<1xi64>
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xi64>) -> tensor<1x1xi64>
+ %23 = tt.addptr %arg5, %1 : !tt.ptr<i64, 1>, i64
+ %24 = tt.splat %23 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
+ tt.store %24, %22, %3 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64>
+ tt.return
+ }
+ }
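
Editorial note: the TTIR above is compiler output and hard to scan, so here is a hand-written Triton (Python DSL) sketch of the computation it appears to encode. Each program walks one row of 7680 targets in 2048-wide blocks, gathers the target logit from a 50257-wide vocabulary, accumulates -(logit - max - log(sumexp)) for targets != -1, and writes a per-row loss sum and valid-token count. All names, the 8-row grid, and the reading of the pointer arguments are reconstructions from the IR, not part of the dump.

import triton
import triton.language as tl

# Hypothetical reconstruction of the kernel dumped above; argument roles inferred from the IR.
@triton.jit
def nll_partial_sum_kernel(targets_ptr,    # arg0: i64 targets, one row of 7680 per program
                           logits_ptr,     # arg1: bf16 logits, row stride 385973760 = 7680 * 50257
                           max_ptr,        # arg2: f32 per-token running max of the logits
                           sumexp_ptr,     # arg3: f32 per-token sum(exp(logit - max))
                           loss_ptr,       # arg4: f32 output, one loss sum per program
                           count_ptr,      # arg5: i64 output, non-ignored token count per program
                           XBLOCK: tl.constexpr):
    row = tl.program_id(0).to(tl.int64)    # the IR additionally masks all work with row < 8
    loss = tl.zeros([XBLOCK], tl.float32)
    count = tl.zeros([XBLOCK], tl.int64)
    for start in range(0, 7680, XBLOCK):   # matches the scf.for with step 2048
        cols = start + tl.arange(0, XBLOCK).to(tl.int64)
        mask = cols < 7680
        tgt = tl.load(targets_ptr + row * 7680 + cols, mask=mask, other=0)
        mx = tl.load(max_ptr + row * 7680 + cols, mask=mask, other=0.0)
        se = tl.load(sumexp_ptr + row * 7680 + cols, mask=mask, other=0.0)
        keep = tgt != -1                                   # ignore_index == -1
        safe = tl.where(keep, tgt, 0)
        safe = tl.where(safe < 0, safe + 50257, safe)      # wrap negative indices
        tl.device_assert((safe >= 0) & (safe < 50257),
                         "index out of bounds: 0 <= tmp7 < 50257")
        logit = tl.load(logits_ptr + row * 385973760 + cols * 50257 + safe,
                        mask=mask, other=0.0).to(tl.float32)
        nll = -(logit - mx - tl.log(se))                   # -log_softmax at the target index
        loss = tl.where(mask, loss + tl.where(keep, nll, 0.0), loss)
        count = tl.where(mask, count + keep.to(tl.int64), count)
    tl.store(loss_ptr + row, tl.sum(loss, axis=0))         # the tt.reduce ops at the end of the IR
    tl.store(count_ptr + row, tl.sum(count, axis=0))

Under the same assumptions it would be launched as nll_partial_sum_kernel[(8,)](..., XBLOCK=2048), consistent with the row < 8 guard and the loop step in the dump.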
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir ADDED
@@ -0,0 +1,68 @@
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+ %cst_0 = arith.constant 9.99999974E-6 : f32
+ %cst_1 = arith.constant 2.560000e+02 : f32
+ %cst_2 = arith.constant 0.000000e+00 : f32
+ %c256_i32 = arith.constant 256 : i32
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+ %0 = tt.get_program_id x : i32
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+ %3 = arith.muli %0, %c256_i32 : i32
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+ %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+ %16 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+ %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+ %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32):
+ %42 = arith.addf %arg8, %arg9 : f32
+ tt.reduce.return %42 : f32
+ }) : (tensor<256xf32, #blocked>) -> f32
+ %19 = arith.addf %18, %cst_2 : f32
+ %20 = arith.divf %19, %cst_1 : f32
+ %21 = tt.splat %20 : (f32) -> tensor<1xf32, #blocked1>
+ %22 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
+ %23 = arith.subf %16, %22 : tensor<256xf32, #blocked>
+ %24 = arith.mulf %23, %23 : tensor<256xf32, #blocked>
+ %25 = arith.select %2, %24, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+ %26 = "tt.reduce"(%25) <{axis = 0 : i32}> ({
+ ^bb0(%arg8: f32, %arg9: f32):
+ %42 = arith.addf %arg8, %arg9 : f32
+ tt.reduce.return %42 : f32
+ }) : (tensor<256xf32, #blocked>) -> f32
+ %27 = arith.addf %26, %cst_2 : f32
+ %28 = arith.divf %27, %cst_1 : f32
+ %29 = arith.addf %28, %cst_0 : f32
+ %30 = tt.extern_elementwise %29 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+ %31 = tt.splat %30 : (f32) -> tensor<1xf32, #blocked1>
+ %32 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
+ %33 = arith.mulf %23, %32 : tensor<256xf32, #blocked>
+ %34 = arith.mulf %33, %15 : tensor<256xf32, #blocked>
+ gpu.barrier
+ %35 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+ %36 = tt.splat %35 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+ tt.store %36, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+ %37 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+ %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+ %39 = arith.truncf %34 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+ tt.store %38, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+ %40 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
+ %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+ tt.store %41, %21 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+ tt.return
+ }
+ }
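
Editorial note: as with the previous dump, this TTGIR reads more easily next to a hedged Triton (Python DSL) sketch. Each program fuses a residual add with a 256-element LayerNorm (weight only, no bias), writing the normalized bf16 row plus the per-row mean and reciprocal standard deviation (the rsqrt of var + 1e-5 that the IR obtains from __nv_rsqrtf). Argument roles and all names are again guesses inferred from the IR.

import triton
import triton.language as tl

# Hypothetical reconstruction of the fused add + LayerNorm kernel dumped above.
@triton.jit
def fused_add_layernorm_kernel(rstd_ptr,   # arg0: f32 output, 1/std per row
                               x_ptr,      # arg1: f32 input rows of N
                               res_ptr,    # arg2: bf16 residual rows of N
                               w_ptr,      # arg3: f32 weight, shared across rows
                               mean_ptr,   # arg4: f32 output, mean per row
                               out_ptr,    # arg5: bf16 normalized output rows of N
                               N: tl.constexpr):  # N == 256 in the dump
    row = tl.program_id(0)
    cols = tl.arange(0, N)
    mask = cols < N                                    # trivially true here; kept from the IR
    x = tl.load(x_ptr + row * N + cols, mask=mask, other=0.0)
    r = tl.load(res_ptr + row * N + cols, mask=mask, other=0.0).to(tl.float32)
    w = tl.load(w_ptr + cols, mask=mask, other=0.0)
    h = x + r                                          # fused residual add
    mean = tl.sum(tl.where(mask, h, 0.0), axis=0) / N
    d = h - mean
    var = tl.sum(tl.where(mask, d * d, 0.0), axis=0) / N
    rstd = 1.0 / tl.sqrt(var + 1e-5)                   # IR computes __nv_rsqrtf(var + 9.99999974e-6)
    y = d * rstd * w                                   # scale by weight; no bias in this kernel
    tl.store(rstd_ptr + row, rstd)
    tl.store(out_ptr + row * N + cols, y.to(tl.bfloat16), mask=mask)
    tl.store(mean_ptr + row, mean)

Storing the mean and rstd alongside the normalized output matches what a LayerNorm backward pass consumes, which is presumably why the dumped kernel materializes them.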