Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .local/share/jupyter/nbextensions/help_panel/img/handle-v.png +0 -0
- .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir +523 -0
- .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir +165 -0
- .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin +0 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx +764 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir +26 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir +25 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin +0 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir +60 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir +184 -0
- .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin +0 -0
- .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir +18 -0
- .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin +0 -0
- .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir +332 -0
- .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir +25 -0
- .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx +278 -0
- .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin +0 -0
- .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir +162 -0
- .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx +338 -0
- .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir +24 -0
- .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir +18 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin +0 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir +132 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir +62 -0
- .triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin +0 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir +62 -0
- .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin +0 -0
- .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx +465 -0
- .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin +0 -0
- .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir +424 -0
- .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx +921 -0
- .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir +81 -0
- .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir +88 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin +0 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir +55 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx +297 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir +21 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir +20 -0
- .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin +0 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin +0 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir +48 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx +280 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir +18 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir +17 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir +166 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx +342 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir +28 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir +20 -0
- .triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin +0 -0
- .triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir +476 -0
.local/share/jupyter/nbextensions/help_panel/img/handle-v.png
ADDED
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir
ADDED
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = and i32 %9, 31, !dbg !10
|
18 |
+
%11 = lshr i32 %9, 5, !dbg !10
|
19 |
+
%12 = and i32 %11, 3, !dbg !10
|
20 |
+
%13 = lshr i32 %10, 1, !dbg !10
|
21 |
+
%14 = shl nuw nsw i32 %12, 4, !dbg !10
|
22 |
+
%15 = or i32 %14, %13, !dbg !10
|
23 |
+
%16 = and i32 %9, 63, !dbg !10
|
24 |
+
%17 = shl i32 %9, 2, !dbg !11
|
25 |
+
%18 = and i32 %17, 4, !dbg !11
|
26 |
+
%19 = and i32 %9, 7, !dbg !11
|
27 |
+
%20 = shl nuw nsw i32 %12, 2, !dbg !12
|
28 |
+
%21 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
|
29 |
+
%22 = shl i32 %21, 6, !dbg !14
|
30 |
+
%23 = or i32 %22, %15, !dbg !15
|
31 |
+
%24 = or i32 %22, %16, !dbg !15
|
32 |
+
%25 = sext i32 %23 to i64, !dbg !16
|
33 |
+
%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
|
34 |
+
%27 = sext i32 %24 to i64, !dbg !16
|
35 |
+
%28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !16
|
36 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
|
37 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
|
38 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
|
39 |
+
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
|
40 |
+
%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #6, !dbg !17
|
41 |
+
%34 = srem i32 %23, 512, !dbg !18
|
42 |
+
%35 = shl nsw i32 %34, 8, !dbg !19
|
43 |
+
%36 = shl i32 %23, 8, !dbg !20
|
44 |
+
%37 = add i64 %33, 50257, !dbg !21
|
45 |
+
%38 = icmp slt i64 %29, 0, !dbg !22
|
46 |
+
%39 = icmp slt i64 %33, 0, !dbg !22
|
47 |
+
%40 = select i1 %39, i64 %37, i64 %33, !dbg !23
|
48 |
+
%41 = icmp ugt i64 %40, 50256, !dbg !24
|
49 |
+
%42 = shl i64 %29, 8, !dbg !25
|
50 |
+
%43 = add i64 %42, 12865792, !dbg !25
|
51 |
+
%44 = select i1 %38, i64 %43, i64 %42, !dbg !25
|
52 |
+
%45 = getelementptr float, ptr addrspace(1) %1, i64 %44
|
53 |
+
br label %46, !dbg !12
|
54 |
+
|
55 |
+
46: ; preds = %8, %92
|
56 |
+
%47 = phi float [ 0.000000e+00, %8 ], [ %116, %92 ]
|
57 |
+
%48 = phi float [ 0.000000e+00, %8 ], [ %117, %92 ]
|
58 |
+
%49 = phi float [ 0.000000e+00, %8 ], [ %118, %92 ]
|
59 |
+
%50 = phi float [ 0.000000e+00, %8 ], [ %119, %92 ]
|
60 |
+
%51 = phi float [ 0.000000e+00, %8 ], [ %120, %92 ]
|
61 |
+
%52 = phi float [ 0.000000e+00, %8 ], [ %121, %92 ]
|
62 |
+
%53 = phi float [ 0.000000e+00, %8 ], [ %122, %92 ]
|
63 |
+
%54 = phi float [ 0.000000e+00, %8 ], [ %123, %92 ]
|
64 |
+
%55 = phi float [ 0.000000e+00, %8 ], [ %140, %92 ]
|
65 |
+
%56 = phi float [ 0.000000e+00, %8 ], [ %141, %92 ]
|
66 |
+
%57 = phi float [ 0.000000e+00, %8 ], [ %142, %92 ]
|
67 |
+
%58 = phi float [ 0.000000e+00, %8 ], [ %143, %92 ]
|
68 |
+
%59 = phi float [ 0.000000e+00, %8 ], [ %128, %92 ]
|
69 |
+
%60 = phi float [ 0.000000e+00, %8 ], [ %129, %92 ]
|
70 |
+
%61 = phi float [ 0.000000e+00, %8 ], [ %130, %92 ]
|
71 |
+
%62 = phi float [ 0.000000e+00, %8 ], [ %131, %92 ]
|
72 |
+
%63 = phi i32 [ 0, %8 ], [ %144, %92 ]
|
73 |
+
%64 = or i32 %63, %18, !dbg !26
|
74 |
+
%65 = add i32 %64, %35, !dbg !27
|
75 |
+
%66 = sext i32 %65 to i64, !dbg !28
|
76 |
+
%67 = getelementptr float, ptr addrspace(1) %2, i64 %66, !dbg !28
|
77 |
+
%68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %67, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
78 |
+
%69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !29
|
79 |
+
%70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !29
|
80 |
+
%71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !29
|
81 |
+
%72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !29
|
82 |
+
%73 = bitcast i32 %69 to float, !dbg !29
|
83 |
+
%74 = bitcast i32 %70 to float, !dbg !29
|
84 |
+
%75 = bitcast i32 %71 to float, !dbg !29
|
85 |
+
%76 = bitcast i32 %72 to float, !dbg !29
|
86 |
+
%77 = add i32 %64, %36, !dbg !30
|
87 |
+
%78 = sext i32 %77 to i64, !dbg !31
|
88 |
+
%79 = getelementptr i16, ptr addrspace(1) %3, i64 %78, !dbg !31
|
89 |
+
%80 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
90 |
+
%81 = extractvalue { i32, i32 } %80, 0, !dbg !32
|
91 |
+
%82 = extractvalue { i32, i32 } %80, 1, !dbg !32
|
92 |
+
%83 = trunc i32 %81 to i16, !dbg !32
|
93 |
+
%extelt.offset3 = lshr i32 %81, 16, !dbg !32
|
94 |
+
%84 = trunc i32 %extelt.offset3 to i16, !dbg !32
|
95 |
+
%85 = trunc i32 %82 to i16, !dbg !32
|
96 |
+
%extelt.offset4 = lshr i32 %82, 16, !dbg !32
|
97 |
+
%86 = trunc i32 %extelt.offset4 to i16, !dbg !32
|
98 |
+
%87 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %83) #6, !dbg !33
|
99 |
+
%88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #6, !dbg !33
|
100 |
+
%89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #6, !dbg !33
|
101 |
+
%90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #6, !dbg !33
|
102 |
+
br i1 %41, label %91, label %92, !dbg !34
|
103 |
+
|
104 |
+
91: ; preds = %46
|
105 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
|
106 |
+
br label %92, !dbg !34
|
107 |
+
|
108 |
+
92: ; preds = %91, %46
|
109 |
+
%93 = zext nneg i32 %64 to i64, !dbg !35
|
110 |
+
%94 = getelementptr float, ptr addrspace(1) %45, i64 %93, !dbg !36
|
111 |
+
%95 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %94, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
112 |
+
%96 = extractvalue { i32, i32, i32, i32 } %95, 0, !dbg !37
|
113 |
+
%97 = extractvalue { i32, i32, i32, i32 } %95, 1, !dbg !37
|
114 |
+
%98 = extractvalue { i32, i32, i32, i32 } %95, 2, !dbg !37
|
115 |
+
%99 = extractvalue { i32, i32, i32, i32 } %95, 3, !dbg !37
|
116 |
+
%100 = bitcast i32 %96 to float, !dbg !37
|
117 |
+
%101 = bitcast i32 %97 to float, !dbg !37
|
118 |
+
%102 = bitcast i32 %98 to float, !dbg !37
|
119 |
+
%103 = bitcast i32 %99 to float, !dbg !37
|
120 |
+
%104 = fadd float %73, %100, !dbg !38
|
121 |
+
%105 = fadd float %74, %101, !dbg !38
|
122 |
+
%106 = fadd float %75, %102, !dbg !38
|
123 |
+
%107 = fadd float %76, %103, !dbg !38
|
124 |
+
%108 = fadd float %87, %104, !dbg !39
|
125 |
+
%109 = fadd float %88, %105, !dbg !39
|
126 |
+
%110 = fadd float %89, %106, !dbg !39
|
127 |
+
%111 = fadd float %90, %107, !dbg !39
|
128 |
+
%112 = fsub float %108, %59, !dbg !40
|
129 |
+
%113 = fsub float %109, %60, !dbg !40
|
130 |
+
%114 = fsub float %110, %61, !dbg !40
|
131 |
+
%115 = fsub float %111, %62, !dbg !40
|
132 |
+
%116 = fadd float %47, 1.000000e+00, !dbg !44
|
133 |
+
%117 = fadd float %48, 1.000000e+00, !dbg !44
|
134 |
+
%118 = fadd float %49, 1.000000e+00, !dbg !44
|
135 |
+
%119 = fadd float %50, 1.000000e+00, !dbg !44
|
136 |
+
%120 = fadd float %51, 1.000000e+00, !dbg !44
|
137 |
+
%121 = fadd float %52, 1.000000e+00, !dbg !44
|
138 |
+
%122 = fadd float %53, 1.000000e+00, !dbg !44
|
139 |
+
%123 = fadd float %54, 1.000000e+00, !dbg !44
|
140 |
+
%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float %116) #6, !dbg !45
|
141 |
+
%125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %113, float %117) #6, !dbg !45
|
142 |
+
%126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float %118) #6, !dbg !45
|
143 |
+
%127 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %115, float %119) #6, !dbg !45
|
144 |
+
%128 = fadd float %59, %124, !dbg !46
|
145 |
+
%129 = fadd float %60, %125, !dbg !46
|
146 |
+
%130 = fadd float %61, %126, !dbg !46
|
147 |
+
%131 = fadd float %62, %127, !dbg !46
|
148 |
+
%132 = fsub float %108, %128, !dbg !47
|
149 |
+
%133 = fsub float %109, %129, !dbg !47
|
150 |
+
%134 = fsub float %110, %130, !dbg !47
|
151 |
+
%135 = fsub float %111, %131, !dbg !47
|
152 |
+
%136 = fmul float %112, %132, !dbg !48
|
153 |
+
%137 = fmul float %113, %133, !dbg !48
|
154 |
+
%138 = fmul float %114, %134, !dbg !48
|
155 |
+
%139 = fmul float %115, %135, !dbg !48
|
156 |
+
%140 = fadd float %55, %136, !dbg !49
|
157 |
+
%141 = fadd float %56, %137, !dbg !49
|
158 |
+
%142 = fadd float %57, %138, !dbg !49
|
159 |
+
%143 = fadd float %58, %139, !dbg !49
|
160 |
+
%144 = add nuw nsw i32 %63, 8, !dbg !12
|
161 |
+
%145 = icmp ult i32 %63, 248, !dbg !12
|
162 |
+
br i1 %145, label %46, label %146, !dbg !12
|
163 |
+
|
164 |
+
146: ; preds = %92
|
165 |
+
%147 = lshr i32 %10, 3, !dbg !12
|
166 |
+
%148 = or i32 %20, %147, !dbg !12
|
167 |
+
%149 = mul nuw nsw i32 %148, 12, !dbg !12
|
168 |
+
%150 = add nuw nsw i32 %149, %19, !dbg !12
|
169 |
+
%151 = zext nneg i32 %150 to i64, !dbg !12
|
170 |
+
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
|
171 |
+
%153 = insertelement <1 x float> undef, float %120, i64 0, !dbg !12
|
172 |
+
store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !12
|
173 |
+
%154 = or i32 %19, 192, !dbg !12
|
174 |
+
%155 = add nuw nsw i32 %154, %149, !dbg !12
|
175 |
+
%156 = zext nneg i32 %155 to i64, !dbg !12
|
176 |
+
%157 = getelementptr float, ptr addrspace(3) @global_smem, i64 %156, !dbg !12
|
177 |
+
%158 = insertelement <1 x float> undef, float %121, i64 0, !dbg !12
|
178 |
+
store <1 x float> %158, ptr addrspace(3) %157, align 4, !dbg !12
|
179 |
+
%159 = or i32 %19, 384, !dbg !12
|
180 |
+
%160 = add nuw nsw i32 %159, %149, !dbg !12
|
181 |
+
%161 = zext nneg i32 %160 to i64, !dbg !12
|
182 |
+
%162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !12
|
183 |
+
%163 = insertelement <1 x float> undef, float %122, i64 0, !dbg !12
|
184 |
+
store <1 x float> %163, ptr addrspace(3) %162, align 4, !dbg !12
|
185 |
+
%164 = or i32 %19, 576, !dbg !12
|
186 |
+
%165 = add nuw nsw i32 %164, %149, !dbg !12
|
187 |
+
%166 = zext nneg i32 %165 to i64, !dbg !12
|
188 |
+
%167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !12
|
189 |
+
%168 = insertelement <1 x float> undef, float %123, i64 0, !dbg !12
|
190 |
+
store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !12
|
191 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
192 |
+
%169 = mul nuw nsw i32 %15, 12, !dbg !12
|
193 |
+
%170 = add nuw nsw i32 %169, %18, !dbg !12
|
194 |
+
%171 = zext nneg i32 %170 to i64, !dbg !12
|
195 |
+
%172 = getelementptr float, ptr addrspace(3) @global_smem, i64 %171, !dbg !12
|
196 |
+
%173 = load float, ptr addrspace(3) %172, align 16, !dbg !12
|
197 |
+
%174 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 1, !dbg !12
|
198 |
+
%175 = load float, ptr addrspace(3) %174, align 4, !dbg !12
|
199 |
+
%176 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 2, !dbg !12
|
200 |
+
%177 = load float, ptr addrspace(3) %176, align 8, !dbg !12
|
201 |
+
%178 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 3, !dbg !12
|
202 |
+
%179 = load float, ptr addrspace(3) %178, align 4, !dbg !12
|
203 |
+
%180 = fsub float %129, %128, !dbg !50
|
204 |
+
%181 = fadd float %173, %175, !dbg !54
|
205 |
+
%182 = fcmp oeq float %181, 0.000000e+00, !dbg !55
|
206 |
+
%183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %181) #6, !dbg !56
|
207 |
+
%184 = select i1 %182, float 0.000000e+00, float %183, !dbg !57
|
208 |
+
%185 = fmul float %180, %184, !dbg !58
|
209 |
+
%186 = fadd float %128, %185, !dbg !59
|
210 |
+
%187 = fadd float %140, %141, !dbg !60
|
211 |
+
%188 = fmul float %180, %180, !dbg !61
|
212 |
+
%189 = fmul float %188, %173, !dbg !62
|
213 |
+
%190 = fmul float %189, %184, !dbg !63
|
214 |
+
%191 = fadd float %187, %190, !dbg !64
|
215 |
+
%192 = fsub float %130, %186, !dbg !50
|
216 |
+
%193 = fadd float %177, %181, !dbg !54
|
217 |
+
%194 = fcmp oeq float %193, 0.000000e+00, !dbg !55
|
218 |
+
%195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %193) #6, !dbg !56
|
219 |
+
%196 = select i1 %194, float 0.000000e+00, float %195, !dbg !57
|
220 |
+
%197 = fmul float %196, %192, !dbg !58
|
221 |
+
%198 = fadd float %186, %197, !dbg !59
|
222 |
+
%199 = fadd float %142, %191, !dbg !60
|
223 |
+
%200 = fmul float %192, %192, !dbg !61
|
224 |
+
%201 = fmul float %181, %200, !dbg !62
|
225 |
+
%202 = fmul float %196, %201, !dbg !63
|
226 |
+
%203 = fadd float %199, %202, !dbg !64
|
227 |
+
%204 = fsub float %131, %198, !dbg !50
|
228 |
+
%205 = fadd float %179, %193, !dbg !54
|
229 |
+
%206 = fcmp oeq float %205, 0.000000e+00, !dbg !55
|
230 |
+
%207 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %179, float %205) #6, !dbg !56
|
231 |
+
%208 = select i1 %206, float 0.000000e+00, float %207, !dbg !57
|
232 |
+
%209 = fmul float %208, %204, !dbg !58
|
233 |
+
%210 = fadd float %198, %209, !dbg !59
|
234 |
+
%211 = fadd float %143, %203, !dbg !60
|
235 |
+
%212 = fmul float %204, %204, !dbg !61
|
236 |
+
%213 = fmul float %193, %212, !dbg !62
|
237 |
+
%214 = fmul float %208, %213, !dbg !63
|
238 |
+
%215 = fadd float %211, %214, !dbg !64
|
239 |
+
%216 = bitcast float %210 to i32, !dbg !65
|
240 |
+
%217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !65
|
241 |
+
%218 = bitcast i32 %217 to float, !dbg !65
|
242 |
+
%219 = bitcast float %215 to i32, !dbg !65
|
243 |
+
%220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !65
|
244 |
+
%221 = bitcast i32 %220 to float, !dbg !65
|
245 |
+
%222 = bitcast float %205 to i32, !dbg !65
|
246 |
+
%223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !65
|
247 |
+
%224 = bitcast i32 %223 to float, !dbg !65
|
248 |
+
%225 = fsub float %218, %210, !dbg !50
|
249 |
+
%226 = fadd float %205, %224, !dbg !54
|
250 |
+
%227 = fcmp oeq float %226, 0.000000e+00, !dbg !55
|
251 |
+
%228 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %224, float %226) #6, !dbg !56
|
252 |
+
%229 = select i1 %227, float 0.000000e+00, float %228, !dbg !57
|
253 |
+
%230 = fmul float %229, %225, !dbg !58
|
254 |
+
%231 = fadd float %210, %230, !dbg !59
|
255 |
+
%232 = fadd float %215, %221, !dbg !60
|
256 |
+
%233 = fmul float %225, %225, !dbg !61
|
257 |
+
%234 = fmul float %205, %233, !dbg !62
|
258 |
+
%235 = fmul float %229, %234, !dbg !63
|
259 |
+
%236 = fadd float %232, %235, !dbg !64
|
260 |
+
%237 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
|
261 |
+
%238 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
|
262 |
+
%239 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
|
263 |
+
%240 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
|
264 |
+
%241 = fadd float %237, 0x3EE4F8B580000000, !dbg !68
|
265 |
+
br label %242, !dbg !69
|
266 |
+
|
267 |
+
242: ; preds = %146, %__nv_rsqrtf.exit
|
268 |
+
%243 = phi i32 [ 0, %146 ], [ %333, %__nv_rsqrtf.exit ]
|
269 |
+
%244 = or i32 %243, %18, !dbg !70
|
270 |
+
%245 = add i32 %244, %35, !dbg !71
|
271 |
+
%246 = sext i32 %245 to i64, !dbg !72
|
272 |
+
%247 = getelementptr float, ptr addrspace(1) %2, i64 %246, !dbg !72
|
273 |
+
%248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
274 |
+
%249 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !73
|
275 |
+
%250 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !73
|
276 |
+
%251 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !73
|
277 |
+
%252 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !73
|
278 |
+
%253 = bitcast i32 %249 to float, !dbg !73
|
279 |
+
%254 = bitcast i32 %250 to float, !dbg !73
|
280 |
+
%255 = bitcast i32 %251 to float, !dbg !73
|
281 |
+
%256 = bitcast i32 %252 to float, !dbg !73
|
282 |
+
%257 = add i32 %244, %36, !dbg !74
|
283 |
+
%258 = sext i32 %257 to i64, !dbg !75
|
284 |
+
%259 = getelementptr i16, ptr addrspace(1) %3, i64 %258, !dbg !75
|
285 |
+
%260 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %259, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
|
286 |
+
%261 = extractvalue { i32, i32 } %260, 0, !dbg !76
|
287 |
+
%262 = extractvalue { i32, i32 } %260, 1, !dbg !76
|
288 |
+
%263 = trunc i32 %261 to i16, !dbg !76
|
289 |
+
%extelt.offset = lshr i32 %261, 16, !dbg !76
|
290 |
+
%264 = trunc i32 %extelt.offset to i16, !dbg !76
|
291 |
+
%265 = trunc i32 %262 to i16, !dbg !76
|
292 |
+
%extelt.offset2 = lshr i32 %262, 16, !dbg !76
|
293 |
+
%266 = trunc i32 %extelt.offset2 to i16, !dbg !76
|
294 |
+
%267 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %263) #6, !dbg !77
|
295 |
+
%268 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %264) #6, !dbg !77
|
296 |
+
%269 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %265) #6, !dbg !77
|
297 |
+
%270 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %266) #6, !dbg !77
|
298 |
+
%271 = zext nneg i32 %244 to i64, !dbg !78
|
299 |
+
%272 = getelementptr float, ptr addrspace(1) %4, i64 %271, !dbg !78
|
300 |
+
%273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %272, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
|
301 |
+
%274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !79
|
302 |
+
%275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !79
|
303 |
+
%276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !79
|
304 |
+
%277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !79
|
305 |
+
%278 = bitcast i32 %274 to float, !dbg !79
|
306 |
+
%279 = bitcast i32 %275 to float, !dbg !79
|
307 |
+
%280 = bitcast i32 %276 to float, !dbg !79
|
308 |
+
%281 = bitcast i32 %277 to float, !dbg !79
|
309 |
+
br i1 %41, label %282, label %283, !dbg !80
|
310 |
+
|
311 |
+
282: ; preds = %242
|
312 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
|
313 |
+
br label %283, !dbg !80
|
314 |
+
|
315 |
+
283: ; preds = %282, %242
|
316 |
+
%284 = getelementptr float, ptr addrspace(1) %45, i64 %271, !dbg !81
|
317 |
+
%285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %284, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
318 |
+
%286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !82
|
319 |
+
%287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !82
|
320 |
+
%288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !82
|
321 |
+
%289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !82
|
322 |
+
%290 = bitcast i32 %286 to float, !dbg !82
|
323 |
+
%291 = bitcast i32 %287 to float, !dbg !82
|
324 |
+
%292 = bitcast i32 %288 to float, !dbg !82
|
325 |
+
%293 = bitcast i32 %289 to float, !dbg !82
|
326 |
+
%294 = fadd float %253, %290, !dbg !83
|
327 |
+
%295 = fadd float %254, %291, !dbg !83
|
328 |
+
%296 = fadd float %255, %292, !dbg !83
|
329 |
+
%297 = fadd float %256, %293, !dbg !83
|
330 |
+
%298 = fadd float %267, %294, !dbg !84
|
331 |
+
%299 = fadd float %268, %295, !dbg !84
|
332 |
+
%300 = fadd float %269, %296, !dbg !84
|
333 |
+
%301 = fadd float %270, %297, !dbg !84
|
334 |
+
%302 = fsub float %298, %231, !dbg !85
|
335 |
+
%303 = fsub float %299, %231, !dbg !85
|
336 |
+
%304 = fsub float %300, %231, !dbg !85
|
337 |
+
%305 = fsub float %301, %231, !dbg !85
|
338 |
+
%306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
339 |
+
%.not.i = icmp eq i32 %306, 0, !dbg !86
|
340 |
+
br i1 %.not.i, label %309, label %307, !dbg !86
|
341 |
+
|
342 |
+
307: ; preds = %283
|
343 |
+
%308 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %241), !dbg !86
|
344 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
345 |
+
|
346 |
+
309: ; preds = %283
|
347 |
+
%310 = tail call float @llvm.nvvm.rsqrt.approx.f(float %241), !dbg !86
|
348 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
349 |
+
|
350 |
+
__nv_rsqrtf.exit: ; preds = %307, %309
|
351 |
+
%.0.i = phi float [ %308, %307 ], [ %310, %309 ], !dbg !86
|
352 |
+
%311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
353 |
+
%312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
354 |
+
%313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
355 |
+
%314 = fmul float %302, %.0.i, !dbg !87
|
356 |
+
%315 = fmul float %303, %.0.i, !dbg !87
|
357 |
+
%316 = fmul float %304, %.0.i, !dbg !87
|
358 |
+
%317 = fmul float %305, %.0.i, !dbg !87
|
359 |
+
%318 = fmul float %314, %278, !dbg !88
|
360 |
+
%319 = fmul float %315, %279, !dbg !88
|
361 |
+
%320 = fmul float %316, %280, !dbg !88
|
362 |
+
%321 = fmul float %317, %281, !dbg !88
|
363 |
+
%322 = getelementptr i16, ptr addrspace(1) %5, i64 %258, !dbg !89
|
364 |
+
%323 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %318) #6, !dbg !90
|
365 |
+
%324 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %319) #6, !dbg !90
|
366 |
+
%325 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %320) #6, !dbg !90
|
367 |
+
%326 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !90
|
368 |
+
%327 = insertelement <2 x i16> undef, i16 %323, i64 0, !dbg !90
|
369 |
+
%328 = insertelement <2 x i16> %327, i16 %324, i64 1, !dbg !90
|
370 |
+
%329 = bitcast <2 x i16> %328 to i32, !dbg !90
|
371 |
+
%330 = insertelement <2 x i16> undef, i16 %325, i64 0, !dbg !90
|
372 |
+
%331 = insertelement <2 x i16> %330, i16 %326, i64 1, !dbg !90
|
373 |
+
%332 = bitcast <2 x i16> %331 to i32, !dbg !90
|
374 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %329, i32 %332, ptr addrspace(1) %322, i1 true) #6, !dbg !90
|
375 |
+
%333 = add nuw nsw i32 %243, 8, !dbg !69
|
376 |
+
%334 = icmp ult i32 %243, 248, !dbg !69
|
377 |
+
br i1 %334, label %242, label %335, !dbg !69
|
378 |
+
|
379 |
+
335: ; preds = %__nv_rsqrtf.exit
|
380 |
+
ret void, !dbg !91
|
381 |
+
}
|
382 |
+
|
383 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
384 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
385 |
+
|
386 |
+
; Function Attrs: convergent nocallback nounwind
|
387 |
+
declare void @llvm.nvvm.barrier0() #1
|
388 |
+
|
389 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
390 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
391 |
+
|
392 |
+
; Function Attrs: alwaysinline nounwind
|
393 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
394 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
395 |
+
%.not = icmp eq i32 %1, 0
|
396 |
+
br i1 %.not, label %4, label %2
|
397 |
+
|
398 |
+
2: ; preds = %0
|
399 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
400 |
+
br label %6
|
401 |
+
|
402 |
+
4: ; preds = %0
|
403 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
404 |
+
br label %6
|
405 |
+
|
406 |
+
6: ; preds = %4, %2
|
407 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
408 |
+
ret float %.0
|
409 |
+
}
|
410 |
+
|
411 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
412 |
+
|
413 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
414 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
415 |
+
|
416 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
417 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
418 |
+
|
419 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
420 |
+
attributes #1 = { convergent nocallback nounwind }
|
421 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
422 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
423 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
424 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
425 |
+
attributes #6 = { nounwind }
|
426 |
+
|
427 |
+
!llvm.module.flags = !{!0, !1}
|
428 |
+
!llvm.dbg.cu = !{!2}
|
429 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
430 |
+
!llvm.ident = !{!6}
|
431 |
+
|
432 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
433 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
434 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
435 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
436 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
437 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
|
438 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
439 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
440 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
441 |
+
!9 = !{}
|
442 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
443 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
444 |
+
!12 = !DILocation(line: 31, column: 36, scope: !7)
|
445 |
+
!13 = !DILocation(line: 21, column: 28, scope: !7)
|
446 |
+
!14 = !DILocation(line: 21, column: 33, scope: !7)
|
447 |
+
!15 = !DILocation(line: 22, column: 23, scope: !7)
|
448 |
+
!16 = !DILocation(line: 26, column: 30, scope: !7)
|
449 |
+
!17 = !DILocation(line: 26, column: 35, scope: !7)
|
450 |
+
!18 = !DILocation(line: 27, column: 18, scope: !7)
|
451 |
+
!19 = !DILocation(line: 35, column: 44, scope: !7)
|
452 |
+
!20 = !DILocation(line: 36, column: 44, scope: !7)
|
453 |
+
!21 = !DILocation(line: 37, column: 22, scope: !7)
|
454 |
+
!22 = !DILocation(line: 38, column: 22, scope: !7)
|
455 |
+
!23 = !DILocation(line: 39, column: 36, scope: !7)
|
456 |
+
!24 = !DILocation(line: 40, column: 40, scope: !7)
|
457 |
+
!25 = !DILocation(line: 41, column: 44, scope: !7)
|
458 |
+
!26 = !DILocation(line: 32, column: 27, scope: !7)
|
459 |
+
!27 = !DILocation(line: 35, column: 40, scope: !7)
|
460 |
+
!28 = !DILocation(line: 35, column: 34, scope: !7)
|
461 |
+
!29 = !DILocation(line: 35, column: 50, scope: !7)
|
462 |
+
!30 = !DILocation(line: 36, column: 40, scope: !7)
|
463 |
+
!31 = !DILocation(line: 36, column: 34, scope: !7)
|
464 |
+
!32 = !DILocation(line: 36, column: 50, scope: !7)
|
465 |
+
!33 = !DILocation(line: 36, column: 101, scope: !7)
|
466 |
+
!34 = !DILocation(line: 40, column: 55, scope: !7)
|
467 |
+
!35 = !DILocation(line: 41, column: 40, scope: !7)
|
468 |
+
!36 = !DILocation(line: 41, column: 34, scope: !7)
|
469 |
+
!37 = !DILocation(line: 41, column: 52, scope: !7)
|
470 |
+
!38 = !DILocation(line: 42, column: 22, scope: !7)
|
471 |
+
!39 = !DILocation(line: 44, column: 22, scope: !7)
|
472 |
+
!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
|
473 |
+
!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
|
474 |
+
!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
475 |
+
!43 = !DILocation(line: 47, column: 41, scope: !41)
|
476 |
+
!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
|
477 |
+
!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
|
478 |
+
!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
|
479 |
+
!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
|
480 |
+
!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
|
481 |
+
!49 = !DILocation(line: 50, column: 50, scope: !7)
|
482 |
+
!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
|
483 |
+
!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
|
484 |
+
!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
|
485 |
+
!53 = !DILocation(line: 53, column: 44, scope: !51)
|
486 |
+
!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
|
487 |
+
!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
|
488 |
+
!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
|
489 |
+
!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
|
490 |
+
!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
|
491 |
+
!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
|
492 |
+
!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
|
493 |
+
!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
|
494 |
+
!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
|
495 |
+
!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
|
496 |
+
!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
|
497 |
+
!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
|
498 |
+
!66 = !DILocation(line: 53, column: 44, scope: !41)
|
499 |
+
!67 = !DILocation(line: 75, column: 24, scope: !7)
|
500 |
+
!68 = !DILocation(line: 77, column: 24, scope: !7)
|
501 |
+
!69 = !DILocation(line: 58, column: 36, scope: !7)
|
502 |
+
!70 = !DILocation(line: 59, column: 27, scope: !7)
|
503 |
+
!71 = !DILocation(line: 62, column: 41, scope: !7)
|
504 |
+
!72 = !DILocation(line: 62, column: 35, scope: !7)
|
505 |
+
!73 = !DILocation(line: 62, column: 51, scope: !7)
|
506 |
+
!74 = !DILocation(line: 63, column: 41, scope: !7)
|
507 |
+
!75 = !DILocation(line: 63, column: 35, scope: !7)
|
508 |
+
!76 = !DILocation(line: 63, column: 51, scope: !7)
|
509 |
+
!77 = !DILocation(line: 63, column: 103, scope: !7)
|
510 |
+
!78 = !DILocation(line: 64, column: 35, scope: !7)
|
511 |
+
!79 = !DILocation(line: 64, column: 40, scope: !7)
|
512 |
+
!80 = !DILocation(line: 68, column: 57, scope: !7)
|
513 |
+
!81 = !DILocation(line: 69, column: 35, scope: !7)
|
514 |
+
!82 = !DILocation(line: 69, column: 54, scope: !7)
|
515 |
+
!83 = !DILocation(line: 70, column: 24, scope: !7)
|
516 |
+
!84 = !DILocation(line: 72, column: 24, scope: !7)
|
517 |
+
!85 = !DILocation(line: 73, column: 24, scope: !7)
|
518 |
+
!86 = !DILocation(line: 78, column: 30, scope: !7)
|
519 |
+
!87 = !DILocation(line: 79, column: 24, scope: !7)
|
520 |
+
!88 = !DILocation(line: 80, column: 24, scope: !7)
|
521 |
+
!89 = !DILocation(line: 82, column: 29, scope: !7)
|
522 |
+
!90 = !DILocation(line: 82, column: 52, scope: !7)
|
523 |
+
!91 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
9 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
|
15 |
+
%cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
|
16 |
+
%cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
|
17 |
+
%c0_i32 = arith.constant 0 : i32
|
18 |
+
%c8_i32 = arith.constant 8 : i32
|
19 |
+
%c256_i32 = arith.constant 256 : i32
|
20 |
+
%cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2>
|
21 |
+
%cst_11 = arith.constant 0.000000e+00 : f32
|
22 |
+
%cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2>
|
23 |
+
%cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2>
|
24 |
+
%cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
|
25 |
+
%cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
|
26 |
+
%cst_16 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
|
27 |
+
%c64_i32 = arith.constant 64 : i32
|
28 |
+
%0 = tt.get_program_id x : i32
|
29 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
30 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
31 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
32 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
33 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
34 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
35 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
36 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
37 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
38 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
39 |
+
%11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
|
40 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
41 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2>
|
42 |
+
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
43 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
|
44 |
+
%16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
|
45 |
+
%17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
46 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
47 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
|
48 |
+
%20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
49 |
+
%21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
|
50 |
+
%22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
51 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
52 |
+
%24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
|
53 |
+
%25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
54 |
+
%26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
55 |
+
%27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
|
56 |
+
%28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
|
57 |
+
%29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
|
58 |
+
%30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
|
59 |
+
%31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
|
60 |
+
%32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
|
61 |
+
%33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
|
62 |
+
%34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
|
63 |
+
%35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>
|
64 |
+
%36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>
|
65 |
+
%37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
|
66 |
+
%38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
67 |
+
%39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>) : i32 {
|
68 |
+
%49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
|
69 |
+
%50 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked2>
|
70 |
+
%51 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
|
71 |
+
%52 = arith.addi %50, %13 : tensor<1x8xi32, #blocked2>
|
72 |
+
%53 = arith.cmpi slt, %51, %cst_0 : tensor<1x8xi32, #blocked>
|
73 |
+
%54 = arith.cmpi slt, %52, %cst_13 : tensor<1x8xi32, #blocked2>
|
74 |
+
%55 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
75 |
+
%56 = arith.addi %55, %22 : tensor<64x8xi32, #blocked>
|
76 |
+
%57 = tt.addptr %23, %56 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
77 |
+
%58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
78 |
+
%59 = tt.broadcast %54 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2>
|
79 |
+
%60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
80 |
+
%61 = arith.addi %55, %25 : tensor<64x8xi32, #blocked>
|
81 |
+
%62 = tt.addptr %26, %61 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
82 |
+
%63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
83 |
+
%64 = arith.extf %63 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
|
84 |
+
tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
85 |
+
%65 = arith.extsi %51 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
|
86 |
+
%66 = tt.broadcast %65 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
|
87 |
+
%67 = arith.addi %66, %37 : tensor<64x8xi64, #blocked>
|
88 |
+
%68 = tt.addptr %38, %67 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
|
89 |
+
%69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
90 |
+
%70 = arith.addf %69, %60 : tensor<64x8xf32, #blocked>
|
91 |
+
%71 = arith.addf %70, %64 : tensor<64x8xf32, #blocked>
|
92 |
+
%72 = arith.subf %71, %arg9 : tensor<64x8xf32, #blocked>
|
93 |
+
%73 = arith.addf %arg12, %cst_4 : tensor<64x8xf32, #blocked>
|
94 |
+
%74 = arith.addf %arg11, %cst_10 : tensor<64x8xf32, #blocked2>
|
95 |
+
%75 = arith.divf %72, %73 : tensor<64x8xf32, #blocked>
|
96 |
+
%76 = arith.addf %arg9, %75 : tensor<64x8xf32, #blocked>
|
97 |
+
%77 = arith.subf %71, %76 : tensor<64x8xf32, #blocked>
|
98 |
+
%78 = arith.mulf %72, %77 : tensor<64x8xf32, #blocked>
|
99 |
+
%79 = arith.addf %arg10, %78 : tensor<64x8xf32, #blocked>
|
100 |
+
%80 = arith.select %58, %76, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
101 |
+
%81 = arith.select %58, %79, %arg10 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
102 |
+
%82 = arith.select %58, %73, %arg12 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
103 |
+
%83 = arith.select %59, %74, %arg11 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2>
|
104 |
+
scf.yield %80, %81, %83, %82 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>
|
105 |
+
}
|
106 |
+
%40 = triton_gpu.convert_layout %39#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked>
|
107 |
+
%41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
|
108 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
109 |
+
%49 = arith.subf %arg11, %arg8 : f32
|
110 |
+
%50 = arith.addf %arg10, %arg13 : f32
|
111 |
+
%51 = arith.cmpf oeq, %50, %cst_11 : f32
|
112 |
+
%52 = arith.divf %arg13, %50 : f32
|
113 |
+
%53 = arith.select %51, %cst_11, %52 : f32
|
114 |
+
%54 = arith.mulf %49, %53 : f32
|
115 |
+
%55 = arith.addf %arg8, %54 : f32
|
116 |
+
%56 = arith.addf %arg9, %arg12 : f32
|
117 |
+
%57 = arith.mulf %49, %49 : f32
|
118 |
+
%58 = arith.mulf %57, %arg10 : f32
|
119 |
+
%59 = arith.mulf %58, %53 : f32
|
120 |
+
%60 = arith.addf %56, %59 : f32
|
121 |
+
tt.reduce.return %55, %60, %50 : f32, f32, f32
|
122 |
+
}) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
123 |
+
%42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
124 |
+
%43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
125 |
+
%44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
|
126 |
+
%45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
|
127 |
+
%46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>
|
128 |
+
%47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>
|
129 |
+
%48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
130 |
+
scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
|
131 |
+
%49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
|
132 |
+
%50 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
|
133 |
+
%51 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked>
|
134 |
+
%52 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
135 |
+
%53 = arith.addi %52, %22 : tensor<64x8xi32, #blocked>
|
136 |
+
%54 = tt.addptr %23, %53 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
137 |
+
%55 = tt.broadcast %51 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
138 |
+
%56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
139 |
+
%57 = arith.addi %52, %25 : tensor<64x8xi32, #blocked>
|
140 |
+
%58 = tt.addptr %26, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
141 |
+
%59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
142 |
+
%60 = arith.extf %59 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
|
143 |
+
%61 = tt.addptr %44, %50 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
|
144 |
+
%62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
|
145 |
+
tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
146 |
+
%63 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
|
147 |
+
%64 = tt.broadcast %63 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
|
148 |
+
%65 = arith.addi %64, %37 : tensor<64x8xi64, #blocked>
|
149 |
+
%66 = tt.addptr %38, %65 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
|
150 |
+
%67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
151 |
+
%68 = arith.addf %67, %56 : tensor<64x8xf32, #blocked>
|
152 |
+
%69 = arith.addf %68, %60 : tensor<64x8xf32, #blocked>
|
153 |
+
%70 = arith.subf %69, %45 : tensor<64x8xf32, #blocked>
|
154 |
+
%71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
|
155 |
+
%72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
|
156 |
+
%73 = arith.mulf %70, %72 : tensor<64x8xf32, #blocked>
|
157 |
+
%74 = tt.broadcast %62 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked>
|
158 |
+
%75 = arith.mulf %73, %74 : tensor<64x8xf32, #blocked>
|
159 |
+
%76 = tt.addptr %48, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
160 |
+
%77 = arith.truncf %75 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
|
161 |
+
tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked>
|
162 |
+
}
|
163 |
+
tt.return
|
164 |
+
}
|
165 |
+
}
|
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin
ADDED
Binary file (18.3 kB). View file
|
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx
ADDED
@@ -0,0 +1,764 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1de(
|
13 |
+
.param .u64 triton__0d1de_param_0,
|
14 |
+
.param .u32 triton__0d1de_param_1
|
15 |
+
)
|
16 |
+
.maxntid 128, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<27>;
|
19 |
+
.reg .b16 %rs<17>;
|
20 |
+
.reg .b32 %r<67>;
|
21 |
+
.reg .f32 %f<431>;
|
22 |
+
.reg .b64 %rd<6>;
|
23 |
+
.loc 1 18 0
|
24 |
+
$L__func_begin0:
|
25 |
+
.loc 1 18 0
|
26 |
+
|
27 |
+
ld.param.u64 %rd3, [triton__0d1de_param_0];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r14, %tid.x;
|
31 |
+
shl.b32 %r15, %r14, 3;
|
32 |
+
and.b32 %r16, %r15, 1016;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r17, %r1, 10;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r18, %r17, %r16;
|
39 |
+
.loc 1 24 34
|
40 |
+
mul.wide.s32 %rd4, %r18, 2;
|
41 |
+
add.s64 %rd5, %rd3, %rd4;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 39
|
44 |
+
mov.u32 %r2, 0x0;
|
45 |
+
mov.u32 %r3, 0x0;
|
46 |
+
mov.u32 %r4, 0x0;
|
47 |
+
mov.u32 %r5, 0x0;
|
48 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd5 + 0 ];
|
49 |
+
cvt.u16.u32 %rs1, %r2;
|
50 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
51 |
+
cvt.u16.u32 %rs3, %r3;
|
52 |
+
.loc 1 24 48
|
53 |
+
cvt.f32.bf16 %r6, %rs1;
|
54 |
+
mov.b32 %f1, %r6;
|
55 |
+
cvt.f32.bf16 %r7, %rs2;
|
56 |
+
mov.b32 %f2, %r7;
|
57 |
+
.loc 1 29 18
|
58 |
+
mul.f32 %f9, %f1, 0f3F3504F3;
|
59 |
+
.loc 1 30 23
|
60 |
+
abs.ftz.f32 %f17, %f9;
|
61 |
+
setp.ge.f32 %p2, %f17, 0f3F8060FE;
|
62 |
+
mov.f32 %f365, 0f3789CA3C;
|
63 |
+
mov.f32 %f364, 0fB9F560B9;
|
64 |
+
mov.f32 %f363, 0f3BAC840B;
|
65 |
+
mov.f32 %f362, 0fBD0C8162;
|
66 |
+
mov.f32 %f361, 0f3E1CF906;
|
67 |
+
mov.f32 %f360, 0f3F6A937E;
|
68 |
+
mov.f32 %f359, 0f3F20D842;
|
69 |
+
mov.f32 %f366, %f17;
|
70 |
+
@%p2 bra $L__BB0_2;
|
71 |
+
.loc 1 0 23
|
72 |
+
mov.f32 %f365, 0f38B1E96A;
|
73 |
+
mov.f32 %f364, 0fBA574D20;
|
74 |
+
mov.f32 %f363, 0f3BAAD5EA;
|
75 |
+
mov.f32 %f362, 0fBCDC1BE7;
|
76 |
+
mov.f32 %f361, 0f3DE718AF;
|
77 |
+
mov.f32 %f360, 0fBEC093AC;
|
78 |
+
mov.f32 %f359, 0f3E0375D3;
|
79 |
+
.loc 1 30 23
|
80 |
+
mul.f32 %f366, %f9, %f9;
|
81 |
+
$L__BB0_2:
|
82 |
+
.loc 1 0 0
|
83 |
+
cvt.f32.bf16 %r8, %rs3;
|
84 |
+
mul.f32 %f10, %f2, 0f3F3504F3;
|
85 |
+
.loc 1 30 23
|
86 |
+
setp.ltu.f32 %p3, %f17, 0f3F8060FE;
|
87 |
+
fma.rn.ftz.f32 %f135, %f365, %f366, %f364;
|
88 |
+
fma.rn.ftz.f32 %f136, %f135, %f366, %f363;
|
89 |
+
fma.rn.ftz.f32 %f137, %f136, %f366, %f362;
|
90 |
+
fma.rn.ftz.f32 %f138, %f137, %f366, %f361;
|
91 |
+
fma.rn.ftz.f32 %f139, %f138, %f366, %f360;
|
92 |
+
fma.rn.ftz.f32 %f140, %f139, %f366, %f359;
|
93 |
+
neg.f32 %f141, %f366;
|
94 |
+
selp.f32 %f142, %f141, %f9, %p2;
|
95 |
+
fma.rn.ftz.f32 %f367, %f140, %f142, %f142;
|
96 |
+
mov.f32 %f358, 0f3F800000;
|
97 |
+
@%p3 bra $L__BB0_4;
|
98 |
+
ex2.approx.ftz.f32 %f143, %f367;
|
99 |
+
sub.f32 %f145, %f358, %f143;
|
100 |
+
mov.b32 %r19, %f145;
|
101 |
+
mov.b32 %r20, %f9;
|
102 |
+
and.b32 %r21, %r20, -2147483648;
|
103 |
+
or.b32 %r22, %r21, %r19;
|
104 |
+
mov.b32 %f367, %r22;
|
105 |
+
$L__BB0_4:
|
106 |
+
.loc 1 0 0
|
107 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
108 |
+
mov.b32 %f3, %r8;
|
109 |
+
.loc 1 30 23
|
110 |
+
abs.ftz.f32 %f30, %f10;
|
111 |
+
setp.ge.f32 %p5, %f30, 0f3F8060FE;
|
112 |
+
mov.f32 %f374, 0f3789CA3C;
|
113 |
+
mov.f32 %f373, 0fB9F560B9;
|
114 |
+
mov.f32 %f372, 0f3BAC840B;
|
115 |
+
mov.f32 %f371, 0fBD0C8162;
|
116 |
+
mov.f32 %f370, 0f3E1CF906;
|
117 |
+
mov.f32 %f369, 0f3F6A937E;
|
118 |
+
mov.f32 %f368, 0f3F20D842;
|
119 |
+
mov.f32 %f375, %f30;
|
120 |
+
@%p5 bra $L__BB0_6;
|
121 |
+
mul.f32 %f375, %f10, %f10;
|
122 |
+
mov.f32 %f374, 0f38B1E96A;
|
123 |
+
mov.f32 %f373, 0fBA574D20;
|
124 |
+
mov.f32 %f372, 0f3BAAD5EA;
|
125 |
+
mov.f32 %f371, 0fBCDC1BE7;
|
126 |
+
mov.f32 %f370, 0f3DE718AF;
|
127 |
+
mov.f32 %f369, 0fBEC093AC;
|
128 |
+
mov.f32 %f368, 0f3E0375D3;
|
129 |
+
$L__BB0_6:
|
130 |
+
.loc 1 0 0
|
131 |
+
cvt.f32.bf16 %r9, %rs4;
|
132 |
+
mul.f32 %f11, %f3, 0f3F3504F3;
|
133 |
+
.loc 1 30 23
|
134 |
+
setp.ltu.f32 %p6, %f30, 0f3F8060FE;
|
135 |
+
fma.rn.ftz.f32 %f160, %f374, %f375, %f373;
|
136 |
+
fma.rn.ftz.f32 %f161, %f160, %f375, %f372;
|
137 |
+
fma.rn.ftz.f32 %f162, %f161, %f375, %f371;
|
138 |
+
fma.rn.ftz.f32 %f163, %f162, %f375, %f370;
|
139 |
+
fma.rn.ftz.f32 %f164, %f163, %f375, %f369;
|
140 |
+
fma.rn.ftz.f32 %f165, %f164, %f375, %f368;
|
141 |
+
neg.f32 %f166, %f375;
|
142 |
+
selp.f32 %f167, %f166, %f10, %p5;
|
143 |
+
fma.rn.ftz.f32 %f376, %f165, %f167, %f167;
|
144 |
+
@%p6 bra $L__BB0_8;
|
145 |
+
ex2.approx.ftz.f32 %f168, %f376;
|
146 |
+
sub.f32 %f170, %f358, %f168;
|
147 |
+
mov.b32 %r23, %f170;
|
148 |
+
mov.b32 %r24, %f10;
|
149 |
+
and.b32 %r25, %r24, -2147483648;
|
150 |
+
or.b32 %r26, %r25, %r23;
|
151 |
+
mov.b32 %f376, %r26;
|
152 |
+
$L__BB0_8:
|
153 |
+
.loc 1 0 0
|
154 |
+
cvt.u16.u32 %rs5, %r4;
|
155 |
+
mov.b32 %f4, %r9;
|
156 |
+
.loc 1 30 23
|
157 |
+
abs.ftz.f32 %f43, %f11;
|
158 |
+
setp.ge.f32 %p8, %f43, 0f3F8060FE;
|
159 |
+
mov.f32 %f383, 0f3789CA3C;
|
160 |
+
mov.f32 %f382, 0fB9F560B9;
|
161 |
+
mov.f32 %f381, 0f3BAC840B;
|
162 |
+
mov.f32 %f380, 0fBD0C8162;
|
163 |
+
mov.f32 %f379, 0f3E1CF906;
|
164 |
+
mov.f32 %f378, 0f3F6A937E;
|
165 |
+
mov.f32 %f377, 0f3F20D842;
|
166 |
+
mov.f32 %f384, %f43;
|
167 |
+
@%p8 bra $L__BB0_10;
|
168 |
+
mul.f32 %f384, %f11, %f11;
|
169 |
+
mov.f32 %f383, 0f38B1E96A;
|
170 |
+
mov.f32 %f382, 0fBA574D20;
|
171 |
+
mov.f32 %f381, 0f3BAAD5EA;
|
172 |
+
mov.f32 %f380, 0fBCDC1BE7;
|
173 |
+
mov.f32 %f379, 0f3DE718AF;
|
174 |
+
mov.f32 %f378, 0fBEC093AC;
|
175 |
+
mov.f32 %f377, 0f3E0375D3;
|
176 |
+
$L__BB0_10:
|
177 |
+
.loc 1 0 0
|
178 |
+
cvt.f32.bf16 %r10, %rs5;
|
179 |
+
mul.f32 %f12, %f4, 0f3F3504F3;
|
180 |
+
.loc 1 30 23
|
181 |
+
setp.ltu.f32 %p9, %f43, 0f3F8060FE;
|
182 |
+
fma.rn.ftz.f32 %f185, %f383, %f384, %f382;
|
183 |
+
fma.rn.ftz.f32 %f186, %f185, %f384, %f381;
|
184 |
+
fma.rn.ftz.f32 %f187, %f186, %f384, %f380;
|
185 |
+
fma.rn.ftz.f32 %f188, %f187, %f384, %f379;
|
186 |
+
fma.rn.ftz.f32 %f189, %f188, %f384, %f378;
|
187 |
+
fma.rn.ftz.f32 %f190, %f189, %f384, %f377;
|
188 |
+
neg.f32 %f191, %f384;
|
189 |
+
selp.f32 %f192, %f191, %f11, %p8;
|
190 |
+
fma.rn.ftz.f32 %f385, %f190, %f192, %f192;
|
191 |
+
@%p9 bra $L__BB0_12;
|
192 |
+
ex2.approx.ftz.f32 %f193, %f385;
|
193 |
+
sub.f32 %f195, %f358, %f193;
|
194 |
+
mov.b32 %r27, %f195;
|
195 |
+
mov.b32 %r28, %f11;
|
196 |
+
and.b32 %r29, %r28, -2147483648;
|
197 |
+
or.b32 %r30, %r29, %r27;
|
198 |
+
mov.b32 %f385, %r30;
|
199 |
+
$L__BB0_12:
|
200 |
+
.loc 1 0 0
|
201 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
|
202 |
+
mov.b32 %f5, %r10;
|
203 |
+
.loc 1 30 23
|
204 |
+
abs.ftz.f32 %f56, %f12;
|
205 |
+
setp.ge.f32 %p11, %f56, 0f3F8060FE;
|
206 |
+
mov.f32 %f392, 0f3789CA3C;
|
207 |
+
mov.f32 %f391, 0fB9F560B9;
|
208 |
+
mov.f32 %f390, 0f3BAC840B;
|
209 |
+
mov.f32 %f389, 0fBD0C8162;
|
210 |
+
mov.f32 %f388, 0f3E1CF906;
|
211 |
+
mov.f32 %f387, 0f3F6A937E;
|
212 |
+
mov.f32 %f386, 0f3F20D842;
|
213 |
+
mov.f32 %f393, %f56;
|
214 |
+
@%p11 bra $L__BB0_14;
|
215 |
+
mul.f32 %f393, %f12, %f12;
|
216 |
+
mov.f32 %f392, 0f38B1E96A;
|
217 |
+
mov.f32 %f391, 0fBA574D20;
|
218 |
+
mov.f32 %f390, 0f3BAAD5EA;
|
219 |
+
mov.f32 %f389, 0fBCDC1BE7;
|
220 |
+
mov.f32 %f388, 0f3DE718AF;
|
221 |
+
mov.f32 %f387, 0fBEC093AC;
|
222 |
+
mov.f32 %f386, 0f3E0375D3;
|
223 |
+
$L__BB0_14:
|
224 |
+
.loc 1 0 0
|
225 |
+
cvt.f32.bf16 %r11, %rs6;
|
226 |
+
mul.f32 %f13, %f5, 0f3F3504F3;
|
227 |
+
.loc 1 30 23
|
228 |
+
setp.ltu.f32 %p12, %f56, 0f3F8060FE;
|
229 |
+
fma.rn.ftz.f32 %f210, %f392, %f393, %f391;
|
230 |
+
fma.rn.ftz.f32 %f211, %f210, %f393, %f390;
|
231 |
+
fma.rn.ftz.f32 %f212, %f211, %f393, %f389;
|
232 |
+
fma.rn.ftz.f32 %f213, %f212, %f393, %f388;
|
233 |
+
fma.rn.ftz.f32 %f214, %f213, %f393, %f387;
|
234 |
+
fma.rn.ftz.f32 %f215, %f214, %f393, %f386;
|
235 |
+
neg.f32 %f216, %f393;
|
236 |
+
selp.f32 %f217, %f216, %f12, %p11;
|
237 |
+
fma.rn.ftz.f32 %f394, %f215, %f217, %f217;
|
238 |
+
@%p12 bra $L__BB0_16;
|
239 |
+
ex2.approx.ftz.f32 %f218, %f394;
|
240 |
+
sub.f32 %f220, %f358, %f218;
|
241 |
+
mov.b32 %r31, %f220;
|
242 |
+
mov.b32 %r32, %f12;
|
243 |
+
and.b32 %r33, %r32, -2147483648;
|
244 |
+
or.b32 %r34, %r33, %r31;
|
245 |
+
mov.b32 %f394, %r34;
|
246 |
+
$L__BB0_16:
|
247 |
+
.loc 1 0 0
|
248 |
+
cvt.u16.u32 %rs7, %r5;
|
249 |
+
mov.b32 %f6, %r11;
|
250 |
+
.loc 1 30 23
|
251 |
+
abs.ftz.f32 %f69, %f13;
|
252 |
+
setp.ge.f32 %p14, %f69, 0f3F8060FE;
|
253 |
+
mov.f32 %f401, 0f3789CA3C;
|
254 |
+
mov.f32 %f400, 0fB9F560B9;
|
255 |
+
mov.f32 %f399, 0f3BAC840B;
|
256 |
+
mov.f32 %f398, 0fBD0C8162;
|
257 |
+
mov.f32 %f397, 0f3E1CF906;
|
258 |
+
mov.f32 %f396, 0f3F6A937E;
|
259 |
+
mov.f32 %f395, 0f3F20D842;
|
260 |
+
mov.f32 %f402, %f69;
|
261 |
+
@%p14 bra $L__BB0_18;
|
262 |
+
mul.f32 %f402, %f13, %f13;
|
263 |
+
mov.f32 %f401, 0f38B1E96A;
|
264 |
+
mov.f32 %f400, 0fBA574D20;
|
265 |
+
mov.f32 %f399, 0f3BAAD5EA;
|
266 |
+
mov.f32 %f398, 0fBCDC1BE7;
|
267 |
+
mov.f32 %f397, 0f3DE718AF;
|
268 |
+
mov.f32 %f396, 0fBEC093AC;
|
269 |
+
mov.f32 %f395, 0f3E0375D3;
|
270 |
+
$L__BB0_18:
|
271 |
+
.loc 1 0 0
|
272 |
+
cvt.f32.bf16 %r12, %rs7;
|
273 |
+
mul.f32 %f14, %f6, 0f3F3504F3;
|
274 |
+
.loc 1 30 23
|
275 |
+
setp.ltu.f32 %p15, %f69, 0f3F8060FE;
|
276 |
+
fma.rn.ftz.f32 %f235, %f401, %f402, %f400;
|
277 |
+
fma.rn.ftz.f32 %f236, %f235, %f402, %f399;
|
278 |
+
fma.rn.ftz.f32 %f237, %f236, %f402, %f398;
|
279 |
+
fma.rn.ftz.f32 %f238, %f237, %f402, %f397;
|
280 |
+
fma.rn.ftz.f32 %f239, %f238, %f402, %f396;
|
281 |
+
fma.rn.ftz.f32 %f240, %f239, %f402, %f395;
|
282 |
+
neg.f32 %f241, %f402;
|
283 |
+
selp.f32 %f242, %f241, %f13, %p14;
|
284 |
+
fma.rn.ftz.f32 %f403, %f240, %f242, %f242;
|
285 |
+
@%p15 bra $L__BB0_20;
|
286 |
+
ex2.approx.ftz.f32 %f243, %f403;
|
287 |
+
sub.f32 %f245, %f358, %f243;
|
288 |
+
mov.b32 %r35, %f245;
|
289 |
+
mov.b32 %r36, %f13;
|
290 |
+
and.b32 %r37, %r36, -2147483648;
|
291 |
+
or.b32 %r38, %r37, %r35;
|
292 |
+
mov.b32 %f403, %r38;
|
293 |
+
$L__BB0_20:
|
294 |
+
.loc 1 0 0
|
295 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
|
296 |
+
mov.b32 %f7, %r12;
|
297 |
+
.loc 1 30 23
|
298 |
+
abs.ftz.f32 %f82, %f14;
|
299 |
+
setp.ge.f32 %p17, %f82, 0f3F8060FE;
|
300 |
+
mov.f32 %f410, 0f3789CA3C;
|
301 |
+
mov.f32 %f409, 0fB9F560B9;
|
302 |
+
mov.f32 %f408, 0f3BAC840B;
|
303 |
+
mov.f32 %f407, 0fBD0C8162;
|
304 |
+
mov.f32 %f406, 0f3E1CF906;
|
305 |
+
mov.f32 %f405, 0f3F6A937E;
|
306 |
+
mov.f32 %f404, 0f3F20D842;
|
307 |
+
mov.f32 %f411, %f82;
|
308 |
+
@%p17 bra $L__BB0_22;
|
309 |
+
mul.f32 %f411, %f14, %f14;
|
310 |
+
mov.f32 %f410, 0f38B1E96A;
|
311 |
+
mov.f32 %f409, 0fBA574D20;
|
312 |
+
mov.f32 %f408, 0f3BAAD5EA;
|
313 |
+
mov.f32 %f407, 0fBCDC1BE7;
|
314 |
+
mov.f32 %f406, 0f3DE718AF;
|
315 |
+
mov.f32 %f405, 0fBEC093AC;
|
316 |
+
mov.f32 %f404, 0f3E0375D3;
|
317 |
+
$L__BB0_22:
|
318 |
+
.loc 1 0 0
|
319 |
+
cvt.f32.bf16 %r13, %rs8;
|
320 |
+
mul.f32 %f15, %f7, 0f3F3504F3;
|
321 |
+
.loc 1 30 23
|
322 |
+
setp.ltu.f32 %p18, %f82, 0f3F8060FE;
|
323 |
+
fma.rn.ftz.f32 %f260, %f410, %f411, %f409;
|
324 |
+
fma.rn.ftz.f32 %f261, %f260, %f411, %f408;
|
325 |
+
fma.rn.ftz.f32 %f262, %f261, %f411, %f407;
|
326 |
+
fma.rn.ftz.f32 %f263, %f262, %f411, %f406;
|
327 |
+
fma.rn.ftz.f32 %f264, %f263, %f411, %f405;
|
328 |
+
fma.rn.ftz.f32 %f265, %f264, %f411, %f404;
|
329 |
+
neg.f32 %f266, %f411;
|
330 |
+
selp.f32 %f267, %f266, %f14, %p17;
|
331 |
+
fma.rn.ftz.f32 %f412, %f265, %f267, %f267;
|
332 |
+
@%p18 bra $L__BB0_24;
|
333 |
+
ex2.approx.ftz.f32 %f268, %f412;
|
334 |
+
sub.f32 %f270, %f358, %f268;
|
335 |
+
mov.b32 %r39, %f270;
|
336 |
+
mov.b32 %r40, %f14;
|
337 |
+
and.b32 %r41, %r40, -2147483648;
|
338 |
+
or.b32 %r42, %r41, %r39;
|
339 |
+
mov.b32 %f412, %r42;
|
340 |
+
$L__BB0_24:
|
341 |
+
.loc 1 0 0
|
342 |
+
mov.b32 %f8, %r13;
|
343 |
+
.loc 1 30 23
|
344 |
+
abs.ftz.f32 %f95, %f15;
|
345 |
+
setp.ge.f32 %p20, %f95, 0f3F8060FE;
|
346 |
+
mov.f32 %f419, 0f3789CA3C;
|
347 |
+
mov.f32 %f418, 0fB9F560B9;
|
348 |
+
mov.f32 %f417, 0f3BAC840B;
|
349 |
+
mov.f32 %f416, 0fBD0C8162;
|
350 |
+
mov.f32 %f415, 0f3E1CF906;
|
351 |
+
mov.f32 %f414, 0f3F6A937E;
|
352 |
+
mov.f32 %f413, 0f3F20D842;
|
353 |
+
mov.f32 %f420, %f95;
|
354 |
+
@%p20 bra $L__BB0_26;
|
355 |
+
mul.f32 %f420, %f15, %f15;
|
356 |
+
mov.f32 %f419, 0f38B1E96A;
|
357 |
+
mov.f32 %f418, 0fBA574D20;
|
358 |
+
mov.f32 %f417, 0f3BAAD5EA;
|
359 |
+
mov.f32 %f416, 0fBCDC1BE7;
|
360 |
+
mov.f32 %f415, 0f3DE718AF;
|
361 |
+
mov.f32 %f414, 0fBEC093AC;
|
362 |
+
mov.f32 %f413, 0f3E0375D3;
|
363 |
+
$L__BB0_26:
|
364 |
+
.loc 1 0 0
|
365 |
+
mul.f32 %f16, %f8, 0f3F3504F3;
|
366 |
+
.loc 1 30 23
|
367 |
+
setp.ltu.f32 %p21, %f95, 0f3F8060FE;
|
368 |
+
fma.rn.ftz.f32 %f285, %f419, %f420, %f418;
|
369 |
+
fma.rn.ftz.f32 %f286, %f285, %f420, %f417;
|
370 |
+
fma.rn.ftz.f32 %f287, %f286, %f420, %f416;
|
371 |
+
fma.rn.ftz.f32 %f288, %f287, %f420, %f415;
|
372 |
+
fma.rn.ftz.f32 %f289, %f288, %f420, %f414;
|
373 |
+
fma.rn.ftz.f32 %f290, %f289, %f420, %f413;
|
374 |
+
neg.f32 %f291, %f420;
|
375 |
+
selp.f32 %f292, %f291, %f15, %p20;
|
376 |
+
fma.rn.ftz.f32 %f421, %f290, %f292, %f292;
|
377 |
+
@%p21 bra $L__BB0_28;
|
378 |
+
ex2.approx.ftz.f32 %f293, %f421;
|
379 |
+
sub.f32 %f295, %f358, %f293;
|
380 |
+
mov.b32 %r43, %f295;
|
381 |
+
mov.b32 %r44, %f15;
|
382 |
+
and.b32 %r45, %r44, -2147483648;
|
383 |
+
or.b32 %r46, %r45, %r43;
|
384 |
+
mov.b32 %f421, %r46;
|
385 |
+
$L__BB0_28:
|
386 |
+
abs.ftz.f32 %f108, %f16;
|
387 |
+
setp.ge.f32 %p23, %f108, 0f3F8060FE;
|
388 |
+
mov.f32 %f428, 0f3789CA3C;
|
389 |
+
mov.f32 %f427, 0fB9F560B9;
|
390 |
+
mov.f32 %f426, 0f3BAC840B;
|
391 |
+
mov.f32 %f425, 0fBD0C8162;
|
392 |
+
mov.f32 %f424, 0f3E1CF906;
|
393 |
+
mov.f32 %f423, 0f3F6A937E;
|
394 |
+
mov.f32 %f422, 0f3F20D842;
|
395 |
+
mov.f32 %f429, %f108;
|
396 |
+
@%p23 bra $L__BB0_30;
|
397 |
+
mul.f32 %f429, %f16, %f16;
|
398 |
+
mov.f32 %f428, 0f38B1E96A;
|
399 |
+
mov.f32 %f427, 0fBA574D20;
|
400 |
+
mov.f32 %f426, 0f3BAAD5EA;
|
401 |
+
mov.f32 %f425, 0fBCDC1BE7;
|
402 |
+
mov.f32 %f424, 0f3DE718AF;
|
403 |
+
mov.f32 %f423, 0fBEC093AC;
|
404 |
+
mov.f32 %f422, 0f3E0375D3;
|
405 |
+
$L__BB0_30:
|
406 |
+
setp.ltu.f32 %p24, %f108, 0f3F8060FE;
|
407 |
+
fma.rn.ftz.f32 %f310, %f428, %f429, %f427;
|
408 |
+
fma.rn.ftz.f32 %f311, %f310, %f429, %f426;
|
409 |
+
fma.rn.ftz.f32 %f312, %f311, %f429, %f425;
|
410 |
+
fma.rn.ftz.f32 %f313, %f312, %f429, %f424;
|
411 |
+
fma.rn.ftz.f32 %f314, %f313, %f429, %f423;
|
412 |
+
fma.rn.ftz.f32 %f315, %f314, %f429, %f422;
|
413 |
+
neg.f32 %f316, %f429;
|
414 |
+
selp.f32 %f317, %f316, %f16, %p23;
|
415 |
+
fma.rn.ftz.f32 %f430, %f315, %f317, %f317;
|
416 |
+
@%p24 bra $L__BB0_32;
|
417 |
+
ex2.approx.ftz.f32 %f318, %f430;
|
418 |
+
sub.f32 %f320, %f358, %f318;
|
419 |
+
mov.b32 %r47, %f320;
|
420 |
+
mov.b32 %r48, %f16;
|
421 |
+
and.b32 %r49, %r48, -2147483648;
|
422 |
+
or.b32 %r50, %r49, %r47;
|
423 |
+
mov.b32 %f430, %r50;
|
424 |
+
$L__BB0_32:
|
425 |
+
.loc 1 27 18
|
426 |
+
mul.f32 %f321, %f8, 0f3F000000;
|
427 |
+
mul.f32 %f322, %f7, 0f3F000000;
|
428 |
+
mul.f32 %f323, %f6, 0f3F000000;
|
429 |
+
mul.f32 %f324, %f5, 0f3F000000;
|
430 |
+
mul.f32 %f325, %f4, 0f3F000000;
|
431 |
+
mul.f32 %f326, %f3, 0f3F000000;
|
432 |
+
mul.f32 %f327, %f2, 0f3F000000;
|
433 |
+
mul.f32 %f328, %f1, 0f3F000000;
|
434 |
+
.loc 1 32 18
|
435 |
+
add.f32 %f329, %f367, 0f3F800000;
|
436 |
+
add.f32 %f330, %f376, 0f3F800000;
|
437 |
+
add.f32 %f331, %f385, 0f3F800000;
|
438 |
+
add.f32 %f332, %f394, 0f3F800000;
|
439 |
+
add.f32 %f333, %f403, 0f3F800000;
|
440 |
+
add.f32 %f334, %f412, 0f3F800000;
|
441 |
+
add.f32 %f335, %f421, 0f3F800000;
|
442 |
+
add.f32 %f336, %f430, 0f3F800000;
|
443 |
+
.loc 1 33 18
|
444 |
+
mul.f32 %f337, %f328, %f329;
|
445 |
+
mul.f32 %f338, %f327, %f330;
|
446 |
+
mul.f32 %f339, %f326, %f331;
|
447 |
+
mul.f32 %f340, %f325, %f332;
|
448 |
+
mul.f32 %f341, %f324, %f333;
|
449 |
+
mul.f32 %f342, %f323, %f334;
|
450 |
+
mul.f32 %f343, %f322, %f335;
|
451 |
+
mul.f32 %f344, %f321, %f336;
|
452 |
+
.loc 1 35 40
|
453 |
+
mov.b32 %r51, %f337;
|
454 |
+
cvt.rn.bf16.f32 %rs9, %r51;
|
455 |
+
mov.b32 %r52, %f338;
|
456 |
+
cvt.rn.bf16.f32 %rs10, %r52;
|
457 |
+
mov.b32 %r53, %f339;
|
458 |
+
cvt.rn.bf16.f32 %rs11, %r53;
|
459 |
+
mov.b32 %r54, %f340;
|
460 |
+
cvt.rn.bf16.f32 %rs12, %r54;
|
461 |
+
mov.b32 %r55, %f341;
|
462 |
+
cvt.rn.bf16.f32 %rs13, %r55;
|
463 |
+
mov.b32 %r56, %f342;
|
464 |
+
cvt.rn.bf16.f32 %rs14, %r56;
|
465 |
+
mov.b32 %r57, %f343;
|
466 |
+
cvt.rn.bf16.f32 %rs15, %r57;
|
467 |
+
mov.b32 %r58, %f344;
|
468 |
+
cvt.rn.bf16.f32 %rs16, %r58;
|
469 |
+
mov.b32 %r63, {%rs9, %rs10};
|
470 |
+
mov.b32 %r64, {%rs11, %rs12};
|
471 |
+
mov.b32 %r65, {%rs13, %rs14};
|
472 |
+
mov.b32 %r66, {%rs15, %rs16};
|
473 |
+
@%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r63, %r64, %r65, %r66 };
|
474 |
+
.loc 1 35 4
|
475 |
+
ret;
|
476 |
+
$L__tmp1:
|
477 |
+
$L__func_end0:
|
478 |
+
|
479 |
+
}
|
480 |
+
// .globl __nv_erff
|
481 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
482 |
+
.param .b32 __nv_erff_param_0
|
483 |
+
)
|
484 |
+
{
|
485 |
+
.reg .pred %p<4>;
|
486 |
+
.reg .b32 %r<5>;
|
487 |
+
.reg .f32 %f<49>;
|
488 |
+
$L__func_begin1:
|
489 |
+
|
490 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
491 |
+
abs.ftz.f32 %f1, %f14;
|
492 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
493 |
+
mov.f32 %f46, 0f3789CA3C;
|
494 |
+
mov.f32 %f45, 0fB9F560B9;
|
495 |
+
mov.f32 %f44, 0f3BAC840B;
|
496 |
+
mov.f32 %f43, 0fBD0C8162;
|
497 |
+
mov.f32 %f42, 0f3E1CF906;
|
498 |
+
mov.f32 %f41, 0f3F6A937E;
|
499 |
+
mov.f32 %f40, 0f3F20D842;
|
500 |
+
mov.f32 %f47, %f1;
|
501 |
+
@%p1 bra $L__BB1_2;
|
502 |
+
mul.f32 %f47, %f14, %f14;
|
503 |
+
mov.f32 %f46, 0f38B1E96A;
|
504 |
+
mov.f32 %f45, 0fBA574D20;
|
505 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
506 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
507 |
+
mov.f32 %f42, 0f3DE718AF;
|
508 |
+
mov.f32 %f41, 0fBEC093AC;
|
509 |
+
mov.f32 %f40, 0f3E0375D3;
|
510 |
+
$L__BB1_2:
|
511 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
512 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
513 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
514 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
515 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
516 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
517 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
518 |
+
neg.f32 %f35, %f47;
|
519 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
520 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
521 |
+
@%p2 bra $L__BB1_4;
|
522 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
523 |
+
mov.f32 %f38, 0f3F800000;
|
524 |
+
sub.f32 %f39, %f38, %f37;
|
525 |
+
mov.b32 %r1, %f39;
|
526 |
+
mov.b32 %r2, %f14;
|
527 |
+
and.b32 %r3, %r2, -2147483648;
|
528 |
+
or.b32 %r4, %r3, %r1;
|
529 |
+
mov.b32 %f48, %r4;
|
530 |
+
$L__BB1_4:
|
531 |
+
st.param.f32 [func_retval0+0], %f48;
|
532 |
+
ret;
|
533 |
+
$L__func_end1:
|
534 |
+
|
535 |
+
}
|
536 |
+
.file 1 "/tmp/torchinductor_root/kp/ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py"
|
537 |
+
.section .debug_abbrev
|
538 |
+
{
|
539 |
+
.b8 1
|
540 |
+
.b8 17
|
541 |
+
.b8 1
|
542 |
+
.b8 37
|
543 |
+
.b8 8
|
544 |
+
.b8 19
|
545 |
+
.b8 5
|
546 |
+
.b8 3
|
547 |
+
.b8 8
|
548 |
+
.b8 16
|
549 |
+
.b8 6
|
550 |
+
.b8 27
|
551 |
+
.b8 8
|
552 |
+
.b8 180
|
553 |
+
.b8 66
|
554 |
+
.b8 12
|
555 |
+
.b8 17
|
556 |
+
.b8 1
|
557 |
+
.b8 18
|
558 |
+
.b8 1
|
559 |
+
.b8 0
|
560 |
+
.b8 0
|
561 |
+
.b8 2
|
562 |
+
.b8 46
|
563 |
+
.b8 0
|
564 |
+
.b8 17
|
565 |
+
.b8 1
|
566 |
+
.b8 18
|
567 |
+
.b8 1
|
568 |
+
.b8 64
|
569 |
+
.b8 10
|
570 |
+
.b8 135
|
571 |
+
.b8 64
|
572 |
+
.b8 8
|
573 |
+
.b8 3
|
574 |
+
.b8 8
|
575 |
+
.b8 58
|
576 |
+
.b8 11
|
577 |
+
.b8 59
|
578 |
+
.b8 11
|
579 |
+
.b8 63
|
580 |
+
.b8 12
|
581 |
+
.b8 0
|
582 |
+
.b8 0
|
583 |
+
.b8 0
|
584 |
+
}
|
585 |
+
.section .debug_info
|
586 |
+
{
|
587 |
+
.b32 172
|
588 |
+
.b8 2
|
589 |
+
.b8 0
|
590 |
+
.b32 .debug_abbrev
|
591 |
+
.b8 8
|
592 |
+
.b8 1
|
593 |
+
.b8 116
|
594 |
+
.b8 114
|
595 |
+
.b8 105
|
596 |
+
.b8 116
|
597 |
+
.b8 111
|
598 |
+
.b8 110
|
599 |
+
.b8 0
|
600 |
+
.b8 2
|
601 |
+
.b8 0
|
602 |
+
.b8 99
|
603 |
+
.b8 107
|
604 |
+
.b8 112
|
605 |
+
.b8 104
|
606 |
+
.b8 114
|
607 |
+
.b8 116
|
608 |
+
.b8 100
|
609 |
+
.b8 112
|
610 |
+
.b8 103
|
611 |
+
.b8 115
|
612 |
+
.b8 120
|
613 |
+
.b8 108
|
614 |
+
.b8 55
|
615 |
+
.b8 115
|
616 |
+
.b8 102
|
617 |
+
.b8 97
|
618 |
+
.b8 114
|
619 |
+
.b8 107
|
620 |
+
.b8 107
|
621 |
+
.b8 122
|
622 |
+
.b8 121
|
623 |
+
.b8 108
|
624 |
+
.b8 104
|
625 |
+
.b8 118
|
626 |
+
.b8 52
|
627 |
+
.b8 115
|
628 |
+
.b8 116
|
629 |
+
.b8 51
|
630 |
+
.b8 117
|
631 |
+
.b8 104
|
632 |
+
.b8 109
|
633 |
+
.b8 122
|
634 |
+
.b8 118
|
635 |
+
.b8 103
|
636 |
+
.b8 51
|
637 |
+
.b8 117
|
638 |
+
.b8 54
|
639 |
+
.b8 122
|
640 |
+
.b8 53
|
641 |
+
.b8 101
|
642 |
+
.b8 120
|
643 |
+
.b8 99
|
644 |
+
.b8 102
|
645 |
+
.b8 112
|
646 |
+
.b8 54
|
647 |
+
.b8 121
|
648 |
+
.b8 100
|
649 |
+
.b8 121
|
650 |
+
.b8 98
|
651 |
+
.b8 113
|
652 |
+
.b8 55
|
653 |
+
.b8 52
|
654 |
+
.b8 46
|
655 |
+
.b8 112
|
656 |
+
.b8 121
|
657 |
+
.b8 0
|
658 |
+
.b32 .debug_line
|
659 |
+
.b8 47
|
660 |
+
.b8 116
|
661 |
+
.b8 109
|
662 |
+
.b8 112
|
663 |
+
.b8 47
|
664 |
+
.b8 116
|
665 |
+
.b8 111
|
666 |
+
.b8 114
|
667 |
+
.b8 99
|
668 |
+
.b8 104
|
669 |
+
.b8 105
|
670 |
+
.b8 110
|
671 |
+
.b8 100
|
672 |
+
.b8 117
|
673 |
+
.b8 99
|
674 |
+
.b8 116
|
675 |
+
.b8 111
|
676 |
+
.b8 114
|
677 |
+
.b8 95
|
678 |
+
.b8 114
|
679 |
+
.b8 111
|
680 |
+
.b8 111
|
681 |
+
.b8 116
|
682 |
+
.b8 47
|
683 |
+
.b8 107
|
684 |
+
.b8 112
|
685 |
+
.b8 0
|
686 |
+
.b8 1
|
687 |
+
.b64 $L__func_begin0
|
688 |
+
.b64 $L__func_end0
|
689 |
+
.b8 2
|
690 |
+
.b64 $L__func_begin0
|
691 |
+
.b64 $L__func_end0
|
692 |
+
.b8 1
|
693 |
+
.b8 156
|
694 |
+
.b8 116
|
695 |
+
.b8 114
|
696 |
+
.b8 105
|
697 |
+
.b8 116
|
698 |
+
.b8 111
|
699 |
+
.b8 110
|
700 |
+
.b8 95
|
701 |
+
.b8 95
|
702 |
+
.b8 48
|
703 |
+
.b8 100
|
704 |
+
.b8 49
|
705 |
+
.b8 100
|
706 |
+
.b8 101
|
707 |
+
.b8 0
|
708 |
+
.b8 116
|
709 |
+
.b8 114
|
710 |
+
.b8 105
|
711 |
+
.b8 116
|
712 |
+
.b8 111
|
713 |
+
.b8 110
|
714 |
+
.b8 95
|
715 |
+
.b8 95
|
716 |
+
.b8 48
|
717 |
+
.b8 100
|
718 |
+
.b8 49
|
719 |
+
.b8 100
|
720 |
+
.b8 101
|
721 |
+
.b8 0
|
722 |
+
.b8 1
|
723 |
+
.b8 18
|
724 |
+
.b8 1
|
725 |
+
.b8 0
|
726 |
+
}
|
727 |
+
.section .debug_pubnames
|
728 |
+
{
|
729 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
730 |
+
$L__pubNames_start0:
|
731 |
+
.b8 2
|
732 |
+
.b8 0
|
733 |
+
.b32 .debug_info
|
734 |
+
.b32 176
|
735 |
+
.b32 125
|
736 |
+
.b8 116
|
737 |
+
.b8 114
|
738 |
+
.b8 105
|
739 |
+
.b8 116
|
740 |
+
.b8 111
|
741 |
+
.b8 110
|
742 |
+
.b8 95
|
743 |
+
.b8 95
|
744 |
+
.b8 48
|
745 |
+
.b8 100
|
746 |
+
.b8 49
|
747 |
+
.b8 100
|
748 |
+
.b8 101
|
749 |
+
.b8 0
|
750 |
+
.b32 0
|
751 |
+
$L__pubNames_end0:
|
752 |
+
}
|
753 |
+
.section .debug_pubtypes
|
754 |
+
{
|
755 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
756 |
+
$L__pubTypes_start0:
|
757 |
+
.b8 2
|
758 |
+
.b8 0
|
759 |
+
.b32 .debug_info
|
760 |
+
.b32 176
|
761 |
+
.b32 0
|
762 |
+
$L__pubTypes_end0:
|
763 |
+
}
|
764 |
+
.section .debug_loc { }
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
|
7 |
+
%c1024_i32 = arith.constant 1024 : i32
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
10 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
11 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
12 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
13 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
14 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
15 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
16 |
+
%8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
|
17 |
+
%9 = arith.mulf %8, %cst_1 : tensor<1024xf32, #blocked>
|
18 |
+
%10 = arith.mulf %8, %cst_0 : tensor<1024xf32, #blocked>
|
19 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
|
20 |
+
%12 = arith.addf %11, %cst : tensor<1024xf32, #blocked>
|
21 |
+
%13 = arith.mulf %9, %12 : tensor<1024xf32, #blocked>
|
22 |
+
%14 = arith.truncf %13 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
23 |
+
tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
24 |
+
tt.return
|
25 |
+
}
|
26 |
+
}
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
|
6 |
+
%c1024_i32 = arith.constant 1024 : i32
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
12 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
13 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
14 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
15 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
16 |
+
%9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
|
17 |
+
%10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
|
18 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
|
19 |
+
%12 = arith.addf %11, %cst : tensor<1024xf32>
|
20 |
+
%13 = arith.mulf %9, %12 : tensor<1024xf32>
|
21 |
+
%14 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
|
22 |
+
tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
23 |
+
tt.return
|
24 |
+
}
|
25 |
+
}
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin
ADDED
Binary file (14.6 kB). View file
|
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<32x1xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<32x1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<512> : tensor<32x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<32x1xi32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
|
10 |
+
%cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
|
11 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1>
|
12 |
+
%cst_6 = arith.constant dense<true> : tensor<32x1xi1, #blocked>
|
13 |
+
%c32_i32 = arith.constant 32 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c32_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
17 |
+
%3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32x1xi32, #blocked1>
|
19 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xi32, #blocked>
|
20 |
+
%6 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked1>
|
21 |
+
%7 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked>
|
22 |
+
%8 = arith.addi %6, %4 : tensor<32x1xi32, #blocked1>
|
23 |
+
%9 = arith.addi %7, %5 : tensor<32x1xi32, #blocked>
|
24 |
+
%10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
25 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
|
26 |
+
%12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
|
27 |
+
%13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
|
28 |
+
%14 = tt.broadcast %8 : (tensor<32x1xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
|
29 |
+
%15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
|
30 |
+
%16 = arith.addi %14, %15 : tensor<32x128xi32, #blocked1>
|
31 |
+
%17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>, #blocked1>
|
32 |
+
%18 = tt.addptr %17, %16 : tensor<32x128x!tt.ptr<f32, 1>, #blocked1>, tensor<32x128xi32, #blocked1>
|
33 |
+
%19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<32x128xi1, #blocked1>
|
34 |
+
%20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32, #blocked1>
|
35 |
+
%21 = arith.addf %20, %cst_5 : tensor<32x128xf32, #blocked1>
|
36 |
+
%22 = arith.select %19, %21, %cst_5 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1>
|
37 |
+
%23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
|
38 |
+
^bb0(%arg5: f32, %arg6: f32):
|
39 |
+
%40 = arith.addf %arg5, %arg6 : f32
|
40 |
+
tt.reduce.return %40 : f32
|
41 |
+
}) : (tensor<32x128xf32, #blocked1>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
42 |
+
%24 = triton_gpu.convert_layout %23 : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
43 |
+
%25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xf32, #blocked>
|
44 |
+
%26 = arith.divsi %9, %cst_2 : tensor<32x1xi32, #blocked>
|
45 |
+
%27 = arith.remsi %9, %cst_2 : tensor<32x1xi32, #blocked>
|
46 |
+
%28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>, #blocked>
|
47 |
+
%29 = tt.addptr %28, %26 : tensor<32x1x!tt.ptr<i64, 1>, #blocked>, tensor<32x1xi32, #blocked>
|
48 |
+
%30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64, #blocked>
|
49 |
+
%31 = arith.addi %30, %cst_1 : tensor<32x1xi64, #blocked>
|
50 |
+
%32 = arith.cmpi slt, %30, %cst_0 : tensor<32x1xi64, #blocked>
|
51 |
+
%33 = arith.select %32, %31, %30 : tensor<32x1xi1, #blocked>, tensor<32x1xi64, #blocked>
|
52 |
+
%34 = arith.muli %33, %cst : tensor<32x1xi64, #blocked>
|
53 |
+
%35 = arith.extsi %27 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked>
|
54 |
+
%36 = arith.addi %35, %34 : tensor<32x1xi64, #blocked>
|
55 |
+
%37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>, #blocked>
|
56 |
+
%38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xi64, #blocked>
|
57 |
+
%39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xf32, #blocked>, tensor<32x1xi1, #blocked>) -> tensor<32x1xf32, #blocked>
|
58 |
+
tt.return
|
59 |
+
}
|
60 |
+
}
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp7 < 50257"
|
7 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
8 |
+
|
9 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
10 |
+
|
11 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2) local_unnamed_addr !dbg !7 {
|
12 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
13 |
+
%5 = and i32 %4, 127, !dbg !10
|
14 |
+
%6 = shl nuw nsw i32 %5, 1, !dbg !10
|
15 |
+
%7 = or i32 %6, 1, !dbg !10
|
16 |
+
%8 = or i32 %6, 256, !dbg !10
|
17 |
+
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !11
|
18 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
19 |
+
%11 = shl nsw i64 %10, 9, !dbg !13
|
20 |
+
%12 = zext nneg i32 %6 to i64
|
21 |
+
%13 = zext nneg i32 %8 to i64
|
22 |
+
%14 = or i64 %11, %12, !dbg !14
|
23 |
+
%15 = or i64 %11, %13, !dbg !14
|
24 |
+
%16 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !15
|
25 |
+
%17 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !15
|
26 |
+
%18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %16, i1 true) #2, !dbg !16
|
27 |
+
%19 = extractvalue { i64, i64 } %18, 0, !dbg !16
|
28 |
+
%20 = extractvalue { i64, i64 } %18, 1, !dbg !16
|
29 |
+
%21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %17, i1 true) #2, !dbg !16
|
30 |
+
%22 = extractvalue { i64, i64 } %21, 0, !dbg !16
|
31 |
+
%23 = extractvalue { i64, i64 } %21, 1, !dbg !16
|
32 |
+
%24 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !17
|
33 |
+
%25 = insertelement <4 x i64> %24, i64 %22, i64 1, !dbg !17
|
34 |
+
%26 = insertelement <4 x i64> %25, i64 %20, i64 2, !dbg !17
|
35 |
+
%27 = insertelement <4 x i64> %26, i64 %19, i64 3, !dbg !17
|
36 |
+
%28 = icmp eq <4 x i64> %27, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !17
|
37 |
+
%29 = select <4 x i1> %28, <4 x i64> zeroinitializer, <4 x i64> %27, !dbg !18
|
38 |
+
%30 = add <4 x i64> %29, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !19
|
39 |
+
%31 = icmp slt <4 x i64> %29, zeroinitializer, !dbg !20
|
40 |
+
%32 = select <4 x i1> %31, <4 x i64> %30, <4 x i64> %29, !dbg !21
|
41 |
+
%33 = icmp ult <4 x i64> %32, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
|
42 |
+
%34 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %12, !dbg !22
|
43 |
+
%35 = extractelement <4 x i1> %33, i64 3, !dbg !22
|
44 |
+
%36 = zext i1 %35 to i8, !dbg !22
|
45 |
+
%37 = insertelement <1 x i8> undef, i8 %36, i64 0, !dbg !22
|
46 |
+
store <1 x i8> %37, ptr addrspace(3) %34, align 1, !dbg !22
|
47 |
+
%38 = zext nneg i32 %7 to i64, !dbg !22
|
48 |
+
%39 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %38, !dbg !22
|
49 |
+
%40 = extractelement <4 x i1> %33, i64 2, !dbg !22
|
50 |
+
%41 = zext i1 %40 to i8, !dbg !22
|
51 |
+
%42 = insertelement <1 x i8> undef, i8 %41, i64 0, !dbg !22
|
52 |
+
store <1 x i8> %42, ptr addrspace(3) %39, align 1, !dbg !22
|
53 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
54 |
+
%43 = zext nneg i32 %5 to i64, !dbg !22
|
55 |
+
%44 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %43, !dbg !22
|
56 |
+
%45 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
|
57 |
+
%46 = or i32 %5, 128, !dbg !22
|
58 |
+
%47 = zext nneg i32 %46 to i64, !dbg !22
|
59 |
+
%48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %47, !dbg !22
|
60 |
+
%49 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
|
61 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
62 |
+
%50 = extractelement <4 x i1> %33, i64 1, !dbg !22
|
63 |
+
%51 = zext i1 %50 to i8, !dbg !22
|
64 |
+
%52 = insertelement <1 x i8> undef, i8 %51, i64 0, !dbg !22
|
65 |
+
store <1 x i8> %52, ptr addrspace(3) %34, align 1, !dbg !22
|
66 |
+
%53 = extractelement <4 x i1> %33, i64 0, !dbg !22
|
67 |
+
%54 = zext i1 %53 to i8, !dbg !22
|
68 |
+
%55 = insertelement <1 x i8> undef, i8 %54, i64 0, !dbg !22
|
69 |
+
store <1 x i8> %55, ptr addrspace(3) %39, align 1, !dbg !22
|
70 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
71 |
+
%56 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
|
72 |
+
%57 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
|
73 |
+
%58 = insertelement <4 x i8> poison, i8 %49, i64 0, !dbg !22
|
74 |
+
%59 = insertelement <4 x i8> %58, i8 %45, i64 1, !dbg !22
|
75 |
+
%60 = insertelement <4 x i8> %59, i8 %56, i64 2, !dbg !22
|
76 |
+
%61 = insertelement <4 x i8> %60, i8 %57, i64 3, !dbg !22
|
77 |
+
%62 = icmp eq <4 x i8> %61, zeroinitializer, !dbg !22
|
78 |
+
%63 = bitcast <4 x i1> %62 to i4, !dbg !23
|
79 |
+
%.not = icmp eq i4 %63, 0, !dbg !23
|
80 |
+
br i1 %.not, label %65, label %64, !dbg !23
|
81 |
+
|
82 |
+
64: ; preds = %3
|
83 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
|
84 |
+
br label %65, !dbg !23
|
85 |
+
|
86 |
+
65: ; preds = %64, %3
|
87 |
+
%66 = or i32 %6, 257, !dbg !10
|
88 |
+
%67 = zext nneg i32 %66 to i64
|
89 |
+
%68 = or i64 %11, %67, !dbg !14
|
90 |
+
%69 = or i64 %11, %38, !dbg !14
|
91 |
+
%70 = mul nsw i64 %14, 50257, !dbg !24
|
92 |
+
%71 = mul nsw i64 %69, 50257, !dbg !24
|
93 |
+
%72 = mul nsw i64 %15, 50257, !dbg !24
|
94 |
+
%73 = mul nsw i64 %68, 50257, !dbg !24
|
95 |
+
%74 = extractelement <4 x i64> %32, i64 3, !dbg !25
|
96 |
+
%75 = getelementptr float, ptr addrspace(1) %1, i64 %74, !dbg !25
|
97 |
+
%76 = getelementptr float, ptr addrspace(1) %75, i64 %70, !dbg !25
|
98 |
+
%77 = extractelement <4 x i64> %32, i64 2, !dbg !25
|
99 |
+
%78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !25
|
100 |
+
%79 = getelementptr float, ptr addrspace(1) %78, i64 %71, !dbg !25
|
101 |
+
%80 = extractelement <4 x i64> %32, i64 1, !dbg !25
|
102 |
+
%81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !25
|
103 |
+
%82 = getelementptr float, ptr addrspace(1) %81, i64 %72, !dbg !25
|
104 |
+
%83 = extractelement <4 x i64> %32, i64 0, !dbg !25
|
105 |
+
%84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !25
|
106 |
+
%85 = getelementptr float, ptr addrspace(1) %84, i64 %73, !dbg !25
|
107 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
108 |
+
%86 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %12, !dbg !26
|
109 |
+
%87 = ptrtoint ptr addrspace(1) %76 to i64, !dbg !26
|
110 |
+
%88 = insertelement <1 x i64> undef, i64 %87, i64 0, !dbg !26
|
111 |
+
store <1 x i64> %88, ptr addrspace(3) %86, align 8, !dbg !26
|
112 |
+
%89 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %38, !dbg !26
|
113 |
+
%90 = ptrtoint ptr addrspace(1) %79 to i64, !dbg !26
|
114 |
+
%91 = insertelement <1 x i64> undef, i64 %90, i64 0, !dbg !26
|
115 |
+
store <1 x i64> %91, ptr addrspace(3) %89, align 8, !dbg !26
|
116 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
117 |
+
%92 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %43, !dbg !26
|
118 |
+
%93 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
|
119 |
+
%94 = inttoptr i64 %93 to ptr addrspace(1), !dbg !26
|
120 |
+
%95 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %47, !dbg !26
|
121 |
+
%96 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
|
122 |
+
%97 = inttoptr i64 %96 to ptr addrspace(1), !dbg !26
|
123 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
124 |
+
%98 = ptrtoint ptr addrspace(1) %82 to i64, !dbg !26
|
125 |
+
%99 = insertelement <1 x i64> undef, i64 %98, i64 0, !dbg !26
|
126 |
+
store <1 x i64> %99, ptr addrspace(3) %86, align 8, !dbg !26
|
127 |
+
%100 = ptrtoint ptr addrspace(1) %85 to i64, !dbg !26
|
128 |
+
%101 = insertelement <1 x i64> undef, i64 %100, i64 0, !dbg !26
|
129 |
+
store <1 x i64> %101, ptr addrspace(3) %89, align 8, !dbg !26
|
130 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
131 |
+
%102 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
|
132 |
+
%103 = inttoptr i64 %102 to ptr addrspace(1), !dbg !26
|
133 |
+
%104 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
|
134 |
+
%105 = inttoptr i64 %104 to ptr addrspace(1), !dbg !26
|
135 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %94, i1 true) #2, !dbg !26
|
136 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %97, i1 true) #2, !dbg !26
|
137 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %103, i1 true) #2, !dbg !26
|
138 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %105, i1 true) #2, !dbg !26
|
139 |
+
ret void, !dbg !27
|
140 |
+
}
|
141 |
+
|
142 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
143 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
144 |
+
|
145 |
+
; Function Attrs: convergent nocallback nounwind
|
146 |
+
declare void @llvm.nvvm.barrier0() #1
|
147 |
+
|
148 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
149 |
+
attributes #1 = { convergent nocallback nounwind }
|
150 |
+
attributes #2 = { nounwind }
|
151 |
+
|
152 |
+
!llvm.module.flags = !{!0, !1}
|
153 |
+
!llvm.dbg.cu = !{!2}
|
154 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
155 |
+
!llvm.ident = !{!6}
|
156 |
+
|
157 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
158 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
159 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
160 |
+
!3 = !DIFile(filename: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py", directory: "/tmp/torchinductor_root/hl")
|
161 |
+
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
162 |
+
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
163 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
164 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
165 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
166 |
+
!9 = !{}
|
167 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
168 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
169 |
+
!12 = !DILocation(line: 20, column: 34, scope: !7)
|
170 |
+
!13 = !DILocation(line: 20, column: 46, scope: !7)
|
171 |
+
!14 = !DILocation(line: 21, column: 23, scope: !7)
|
172 |
+
!15 = !DILocation(line: 24, column: 30, scope: !7)
|
173 |
+
!16 = !DILocation(line: 24, column: 35, scope: !7)
|
174 |
+
!17 = !DILocation(line: 26, column: 19, scope: !7)
|
175 |
+
!18 = !DILocation(line: 28, column: 32, scope: !7)
|
176 |
+
!19 = !DILocation(line: 29, column: 18, scope: !7)
|
177 |
+
!20 = !DILocation(line: 30, column: 18, scope: !7)
|
178 |
+
!21 = !DILocation(line: 31, column: 32, scope: !7)
|
179 |
+
!22 = !DILocation(line: 32, column: 36, scope: !7)
|
180 |
+
!23 = !DILocation(line: 32, column: 51, scope: !7)
|
181 |
+
!24 = !DILocation(line: 34, column: 39, scope: !7)
|
182 |
+
!25 = !DILocation(line: 34, column: 25, scope: !7)
|
183 |
+
!26 = !DILocation(line: 34, column: 51, scope: !7)
|
184 |
+
!27 = !DILocation(line: 34, column: 4, scope: !7)
|
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin
ADDED
Binary file (5.54 kB). View file
|
|
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c1024_i32 = arith.constant 1024 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32>
|
12 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
13 |
+
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
14 |
+
%10 = arith.truncf %7 : tensor<1024xf32> to tensor<1024xbf16>
|
15 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin
ADDED
Binary file (10.5 kB). View file
|
|
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir
ADDED
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
5 |
+
|
6 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 {
|
7 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
8 |
+
%4 = shl i32 %3, 1, !dbg !10
|
9 |
+
%5 = and i32 %4, 510, !dbg !10
|
10 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
|
11 |
+
%7 = shl i32 %6, 9, !dbg !12
|
12 |
+
%8 = or i32 %7, %5, !dbg !13
|
13 |
+
%9 = sext i32 %8 to i64, !dbg !14
|
14 |
+
%10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14
|
15 |
+
%11 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15
|
16 |
+
%12 = trunc i32 %11 to i16, !dbg !15
|
17 |
+
%extelt.offset = lshr i32 %11, 16, !dbg !15
|
18 |
+
%13 = trunc i32 %extelt.offset to i16, !dbg !15
|
19 |
+
%14 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %12) #4, !dbg !16
|
20 |
+
%15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
|
21 |
+
%16 = fmul float %14, 0x3FE6A09E60000000, !dbg !17
|
22 |
+
%17 = fmul float %15, 0x3FE6A09E60000000, !dbg !17
|
23 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
24 |
+
%.not.i = icmp eq i32 %18, 0, !dbg !18
|
25 |
+
%19 = tail call float @llvm.nvvm.fabs.ftz.f(float %16) #4, !dbg !18
|
26 |
+
%20 = tail call float @llvm.nvvm.fabs.f(float %16) #4, !dbg !18
|
27 |
+
%.0.i = select i1 %.not.i, float %20, float %19, !dbg !18
|
28 |
+
%21 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
|
29 |
+
br i1 %21, label %__nv_fabsf.exit1.i, label %23, !dbg !18
|
30 |
+
|
31 |
+
__nv_fabsf.exit1.i: ; preds = %2
|
32 |
+
%22 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
33 |
+
%.not1.i = icmp eq i32 %22, 0, !dbg !18
|
34 |
+
%.01.i = select i1 %.not1.i, float %20, float %19, !dbg !18
|
35 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
36 |
+
|
37 |
+
23: ; preds = %2
|
38 |
+
%24 = fmul float %16, %16, !dbg !18
|
39 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
40 |
+
|
41 |
+
__internal_fmad.exit.i: ; preds = %23, %__nv_fabsf.exit1.i
|
42 |
+
%25 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %23 ], !dbg !18
|
43 |
+
%26 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %23 ], !dbg !18
|
44 |
+
%27 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %23 ], !dbg !18
|
45 |
+
%28 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %23 ], !dbg !18
|
46 |
+
%29 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %23 ], !dbg !18
|
47 |
+
%30 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %23 ], !dbg !18
|
48 |
+
%31 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %23 ], !dbg !18
|
49 |
+
%32 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %24, %23 ], !dbg !18
|
50 |
+
%33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
51 |
+
%.not2.i = icmp eq i32 %33, 0, !dbg !18
|
52 |
+
%34 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %31, float %32, float %30) #4, !dbg !18
|
53 |
+
%35 = tail call float @llvm.nvvm.fma.rn.f(float %31, float %32, float %30) #4, !dbg !18
|
54 |
+
%.02.i = select i1 %.not2.i, float %35, float %34, !dbg !18
|
55 |
+
%36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
56 |
+
%.not3.i = icmp eq i32 %36, 0, !dbg !18
|
57 |
+
%37 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %32, float %29) #4, !dbg !18
|
58 |
+
%38 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %32, float %29) #4, !dbg !18
|
59 |
+
%.03.i = select i1 %.not3.i, float %38, float %37, !dbg !18
|
60 |
+
%39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
61 |
+
%.not4.i = icmp eq i32 %39, 0, !dbg !18
|
62 |
+
%40 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %32, float %28) #4, !dbg !18
|
63 |
+
%41 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %32, float %28) #4, !dbg !18
|
64 |
+
%.04.i = select i1 %.not4.i, float %41, float %40, !dbg !18
|
65 |
+
%42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
66 |
+
%.not5.i = icmp eq i32 %42, 0, !dbg !18
|
67 |
+
%43 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %32, float %27) #4, !dbg !18
|
68 |
+
%44 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %32, float %27) #4, !dbg !18
|
69 |
+
%.05.i = select i1 %.not5.i, float %44, float %43, !dbg !18
|
70 |
+
%45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
71 |
+
%.not6.i = icmp eq i32 %45, 0, !dbg !18
|
72 |
+
%46 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %32, float %26) #4, !dbg !18
|
73 |
+
%47 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %32, float %26) #4, !dbg !18
|
74 |
+
%.06.i = select i1 %.not6.i, float %47, float %46, !dbg !18
|
75 |
+
%48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
76 |
+
%.not7.i = icmp eq i32 %48, 0, !dbg !18
|
77 |
+
%49 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %32, float %25) #4, !dbg !18
|
78 |
+
%50 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %32, float %25) #4, !dbg !18
|
79 |
+
%.07.i = select i1 %.not7.i, float %50, float %49, !dbg !18
|
80 |
+
%51 = fneg float %32, !dbg !18
|
81 |
+
%52 = select i1 %21, float %51, float %16, !dbg !18
|
82 |
+
%53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
83 |
+
%.not8.i = icmp eq i32 %53, 0, !dbg !18
|
84 |
+
%54 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %52, float %52) #4, !dbg !18
|
85 |
+
%55 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %52, float %52) #4, !dbg !18
|
86 |
+
%.08.i = select i1 %.not8.i, float %55, float %54, !dbg !18
|
87 |
+
br i1 %21, label %56, label %__nv_erff.exit, !dbg !18
|
88 |
+
|
89 |
+
56: ; preds = %__internal_fmad.exit.i
|
90 |
+
%57 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
|
91 |
+
%58 = fsub float 1.000000e+00, %57, !dbg !18
|
92 |
+
%59 = bitcast float %58 to i32, !dbg !18
|
93 |
+
%60 = bitcast float %16 to i32, !dbg !18
|
94 |
+
%61 = and i32 %60, -2147483648, !dbg !18
|
95 |
+
%62 = or i32 %61, %59, !dbg !18
|
96 |
+
%63 = bitcast i32 %62 to float, !dbg !18
|
97 |
+
br label %__nv_erff.exit, !dbg !18
|
98 |
+
|
99 |
+
__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %56
|
100 |
+
%r.0.i = phi float [ %63, %56 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
|
101 |
+
%64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
102 |
+
%.not.i1 = icmp eq i32 %64, 0, !dbg !18
|
103 |
+
%65 = tail call float @llvm.nvvm.fabs.ftz.f(float %17) #4, !dbg !18
|
104 |
+
%66 = tail call float @llvm.nvvm.fabs.f(float %17) #4, !dbg !18
|
105 |
+
%.0.i2 = select i1 %.not.i1, float %66, float %65, !dbg !18
|
106 |
+
%67 = fcmp oge float %.0.i2, 0x3FF00C1FC0000000, !dbg !18
|
107 |
+
br i1 %67, label %__nv_fabsf.exit1.i19, label %69, !dbg !18
|
108 |
+
|
109 |
+
__nv_fabsf.exit1.i19: ; preds = %__nv_erff.exit
|
110 |
+
%68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
111 |
+
%.not1.i20 = icmp eq i32 %68, 0, !dbg !18
|
112 |
+
%.01.i21 = select i1 %.not1.i20, float %66, float %65, !dbg !18
|
113 |
+
br label %__internal_fmad.exit.i3, !dbg !18
|
114 |
+
|
115 |
+
69: ; preds = %__nv_erff.exit
|
116 |
+
%70 = fmul float %17, %17, !dbg !18
|
117 |
+
br label %__internal_fmad.exit.i3, !dbg !18
|
118 |
+
|
119 |
+
__internal_fmad.exit.i3: ; preds = %69, %__nv_fabsf.exit1.i19
|
120 |
+
%71 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i19 ], [ 0x3FC06EBA60000000, %69 ], !dbg !18
|
121 |
+
%72 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i19 ], [ 0xBFD8127580000000, %69 ], !dbg !18
|
122 |
+
%73 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i19 ], [ 0x3FBCE315E0000000, %69 ], !dbg !18
|
123 |
+
%74 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i19 ], [ 0xBF9B837CE0000000, %69 ], !dbg !18
|
124 |
+
%75 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i19 ], [ 0x3F755ABD40000000, %69 ], !dbg !18
|
125 |
+
%76 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i19 ], [ 0xBF4AE9A400000000, %69 ], !dbg !18
|
126 |
+
%77 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i19 ], [ 0x3F163D2D40000000, %69 ], !dbg !18
|
127 |
+
%78 = phi float [ %.01.i21, %__nv_fabsf.exit1.i19 ], [ %70, %69 ], !dbg !18
|
128 |
+
%79 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
129 |
+
%.not2.i4 = icmp eq i32 %79, 0, !dbg !18
|
130 |
+
%80 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float %78, float %76) #4, !dbg !18
|
131 |
+
%81 = tail call float @llvm.nvvm.fma.rn.f(float %77, float %78, float %76) #4, !dbg !18
|
132 |
+
%.02.i5 = select i1 %.not2.i4, float %81, float %80, !dbg !18
|
133 |
+
%82 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
134 |
+
%.not3.i6 = icmp eq i32 %82, 0, !dbg !18
|
135 |
+
%83 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i5, float %78, float %75) #4, !dbg !18
|
136 |
+
%84 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i5, float %78, float %75) #4, !dbg !18
|
137 |
+
%.03.i7 = select i1 %.not3.i6, float %84, float %83, !dbg !18
|
138 |
+
%85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
139 |
+
%.not4.i8 = icmp eq i32 %85, 0, !dbg !18
|
140 |
+
%86 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i7, float %78, float %74) #4, !dbg !18
|
141 |
+
%87 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i7, float %78, float %74) #4, !dbg !18
|
142 |
+
%.04.i9 = select i1 %.not4.i8, float %87, float %86, !dbg !18
|
143 |
+
%88 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
144 |
+
%.not5.i10 = icmp eq i32 %88, 0, !dbg !18
|
145 |
+
%89 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i9, float %78, float %73) #4, !dbg !18
|
146 |
+
%90 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i9, float %78, float %73) #4, !dbg !18
|
147 |
+
%.05.i11 = select i1 %.not5.i10, float %90, float %89, !dbg !18
|
148 |
+
%91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
149 |
+
%.not6.i12 = icmp eq i32 %91, 0, !dbg !18
|
150 |
+
%92 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i11, float %78, float %72) #4, !dbg !18
|
151 |
+
%93 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i11, float %78, float %72) #4, !dbg !18
|
152 |
+
%.06.i13 = select i1 %.not6.i12, float %93, float %92, !dbg !18
|
153 |
+
%94 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
154 |
+
%.not7.i14 = icmp eq i32 %94, 0, !dbg !18
|
155 |
+
%95 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i13, float %78, float %71) #4, !dbg !18
|
156 |
+
%96 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i13, float %78, float %71) #4, !dbg !18
|
157 |
+
%.07.i15 = select i1 %.not7.i14, float %96, float %95, !dbg !18
|
158 |
+
%97 = fneg float %78, !dbg !18
|
159 |
+
%98 = select i1 %67, float %97, float %17, !dbg !18
|
160 |
+
%99 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
161 |
+
%.not8.i16 = icmp eq i32 %99, 0, !dbg !18
|
162 |
+
%100 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i15, float %98, float %98) #4, !dbg !18
|
163 |
+
%101 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i15, float %98, float %98) #4, !dbg !18
|
164 |
+
%.08.i17 = select i1 %.not8.i16, float %101, float %100, !dbg !18
|
165 |
+
br i1 %67, label %102, label %__nv_erff.exit22, !dbg !18
|
166 |
+
|
167 |
+
102: ; preds = %__internal_fmad.exit.i3
|
168 |
+
%103 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i17) #4, !dbg !18
|
169 |
+
%104 = fsub float 1.000000e+00, %103, !dbg !18
|
170 |
+
%105 = bitcast float %104 to i32, !dbg !18
|
171 |
+
%106 = bitcast float %17 to i32, !dbg !18
|
172 |
+
%107 = and i32 %106, -2147483648, !dbg !18
|
173 |
+
%108 = or i32 %107, %105, !dbg !18
|
174 |
+
%109 = bitcast i32 %108 to float, !dbg !18
|
175 |
+
br label %__nv_erff.exit22, !dbg !18
|
176 |
+
|
177 |
+
__nv_erff.exit22: ; preds = %__internal_fmad.exit.i3, %102
|
178 |
+
%r.0.i18 = phi float [ %109, %102 ], [ %.08.i17, %__internal_fmad.exit.i3 ], !dbg !18
|
179 |
+
%110 = fmul float %15, 5.000000e-01, !dbg !19
|
180 |
+
%111 = fmul float %14, 5.000000e-01, !dbg !19
|
181 |
+
%112 = fadd float %r.0.i, 1.000000e+00, !dbg !20
|
182 |
+
%113 = fadd float %r.0.i18, 1.000000e+00, !dbg !20
|
183 |
+
%114 = fmul float %111, %112, !dbg !21
|
184 |
+
%115 = fmul float %110, %113, !dbg !21
|
185 |
+
%116 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %114) #4, !dbg !22
|
186 |
+
%117 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %115) #4, !dbg !22
|
187 |
+
%118 = insertelement <2 x i16> undef, i16 %116, i64 0, !dbg !22
|
188 |
+
%119 = insertelement <2 x i16> %118, i16 %117, i64 1, !dbg !22
|
189 |
+
%120 = bitcast <2 x i16> %119 to i32, !dbg !22
|
190 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %120, ptr addrspace(1) %10, i1 true) #4, !dbg !22
|
191 |
+
ret void, !dbg !23
|
192 |
+
}
|
193 |
+
|
194 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
195 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
196 |
+
|
197 |
+
; Function Attrs: alwaysinline nounwind
|
198 |
+
define float @__nv_erff(float %a) local_unnamed_addr #1 {
|
199 |
+
__nv_fabsf.exit:
|
200 |
+
%0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
201 |
+
%.not = icmp eq i32 %0, 0
|
202 |
+
%1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
|
203 |
+
%2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
|
204 |
+
%.0 = select i1 %.not, float %2, float %1
|
205 |
+
%3 = fcmp oge float %.0, 0x3FF00C1FC0000000
|
206 |
+
br i1 %3, label %__nv_fabsf.exit1, label %5
|
207 |
+
|
208 |
+
__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
|
209 |
+
%4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
210 |
+
%.not1 = icmp eq i32 %4, 0
|
211 |
+
%.01 = select i1 %.not1, float %2, float %1
|
212 |
+
br label %__internal_fmad.exit
|
213 |
+
|
214 |
+
5: ; preds = %__nv_fabsf.exit
|
215 |
+
%6 = fmul float %a, %a
|
216 |
+
br label %__internal_fmad.exit
|
217 |
+
|
218 |
+
__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
|
219 |
+
%7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
|
220 |
+
%8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
|
221 |
+
%9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
|
222 |
+
%10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
|
223 |
+
%11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
|
224 |
+
%12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
|
225 |
+
%13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
|
226 |
+
%14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
|
227 |
+
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
228 |
+
%.not2 = icmp eq i32 %15, 0
|
229 |
+
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
|
230 |
+
%17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
|
231 |
+
%.02 = select i1 %.not2, float %17, float %16
|
232 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
233 |
+
%.not3 = icmp eq i32 %18, 0
|
234 |
+
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
|
235 |
+
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
|
236 |
+
%.03 = select i1 %.not3, float %20, float %19
|
237 |
+
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
238 |
+
%.not4 = icmp eq i32 %21, 0
|
239 |
+
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
|
240 |
+
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
|
241 |
+
%.04 = select i1 %.not4, float %23, float %22
|
242 |
+
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
243 |
+
%.not5 = icmp eq i32 %24, 0
|
244 |
+
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
|
245 |
+
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
|
246 |
+
%.05 = select i1 %.not5, float %26, float %25
|
247 |
+
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
248 |
+
%.not6 = icmp eq i32 %27, 0
|
249 |
+
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
|
250 |
+
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
|
251 |
+
%.06 = select i1 %.not6, float %29, float %28
|
252 |
+
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
253 |
+
%.not7 = icmp eq i32 %30, 0
|
254 |
+
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
|
255 |
+
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
|
256 |
+
%.07 = select i1 %.not7, float %32, float %31
|
257 |
+
%33 = fneg float %14
|
258 |
+
%34 = select i1 %3, float %33, float %a
|
259 |
+
%35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
260 |
+
%.not8 = icmp eq i32 %35, 0
|
261 |
+
%36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
|
262 |
+
%37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
|
263 |
+
%.08 = select i1 %.not8, float %37, float %36
|
264 |
+
br i1 %3, label %38, label %46
|
265 |
+
|
266 |
+
38: ; preds = %__internal_fmad.exit
|
267 |
+
%39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
|
268 |
+
%40 = fsub float 1.000000e+00, %39
|
269 |
+
%41 = bitcast float %40 to i32
|
270 |
+
%42 = bitcast float %a to i32
|
271 |
+
%43 = and i32 %42, -2147483648
|
272 |
+
%44 = or i32 %43, %41
|
273 |
+
%45 = bitcast i32 %44 to float
|
274 |
+
br label %46
|
275 |
+
|
276 |
+
46: ; preds = %38, %__internal_fmad.exit
|
277 |
+
%r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
|
278 |
+
ret float %r.0
|
279 |
+
}
|
280 |
+
|
281 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
|
282 |
+
|
283 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
284 |
+
declare float @llvm.nvvm.fabs.ftz.f(float) #0
|
285 |
+
|
286 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
287 |
+
declare float @llvm.nvvm.fabs.f(float) #0
|
288 |
+
|
289 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
290 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
|
291 |
+
|
292 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
293 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
|
294 |
+
|
295 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
296 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
297 |
+
|
298 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
299 |
+
attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
300 |
+
attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
301 |
+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
302 |
+
attributes #4 = { nounwind }
|
303 |
+
|
304 |
+
!llvm.module.flags = !{!0, !1}
|
305 |
+
!llvm.dbg.cu = !{!2}
|
306 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
307 |
+
!llvm.ident = !{!6}
|
308 |
+
|
309 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
310 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
311 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
312 |
+
!3 = !DIFile(filename: "ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py", directory: "/tmp/torchinductor_root/kp")
|
313 |
+
!4 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
314 |
+
!5 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
|
315 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
316 |
+
!7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
317 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
318 |
+
!9 = !{}
|
319 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
320 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
321 |
+
!12 = !DILocation(line: 20, column: 33, scope: !7)
|
322 |
+
!13 = !DILocation(line: 21, column: 23, scope: !7)
|
323 |
+
!14 = !DILocation(line: 24, column: 34, scope: !7)
|
324 |
+
!15 = !DILocation(line: 24, column: 39, scope: !7)
|
325 |
+
!16 = !DILocation(line: 24, column: 48, scope: !7)
|
326 |
+
!17 = !DILocation(line: 29, column: 18, scope: !7)
|
327 |
+
!18 = !DILocation(line: 30, column: 23, scope: !7)
|
328 |
+
!19 = !DILocation(line: 27, column: 18, scope: !7)
|
329 |
+
!20 = !DILocation(line: 32, column: 18, scope: !7)
|
330 |
+
!21 = !DILocation(line: 33, column: 18, scope: !7)
|
331 |
+
!22 = !DILocation(line: 35, column: 40, scope: !7)
|
332 |
+
!23 = !DILocation(line: 35, column: 4, scope: !7)
|
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<512xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<512xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
|
6 |
+
%c512_i32 = arith.constant 512 : i32
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
12 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
|
13 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
|
14 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
|
15 |
+
%8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
|
16 |
+
%9 = arith.mulf %8, %cst_1 : tensor<512xf32>
|
17 |
+
%10 = arith.mulf %8, %cst_0 : tensor<512xf32>
|
18 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32>
|
19 |
+
%12 = arith.addf %11, %cst : tensor<512xf32>
|
20 |
+
%13 = arith.mulf %9, %12 : tensor<512xf32>
|
21 |
+
%14 = arith.truncf %13 : tensor<512xf32> to tensor<512xbf16>
|
22 |
+
tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
|
23 |
+
tt.return
|
24 |
+
}
|
25 |
+
}
|
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1de(
|
12 |
+
.param .u64 triton__0d1de_param_0,
|
13 |
+
.param .u64 triton__0d1de_param_1
|
14 |
+
)
|
15 |
+
.maxntid 256, 1, 1
|
16 |
+
{
|
17 |
+
.reg .pred %p<2>;
|
18 |
+
.reg .b32 %r<7>;
|
19 |
+
.reg .b64 %rd<7>;
|
20 |
+
.loc 1 18 0
|
21 |
+
$L__func_begin0:
|
22 |
+
.loc 1 18 0
|
23 |
+
|
24 |
+
ld.param.u64 %rd2, [triton__0d1de_param_0];
|
25 |
+
$L__tmp0:
|
26 |
+
.loc 1 21 36
|
27 |
+
mov.u32 %r4, %tid.x;
|
28 |
+
shl.b32 %r5, %r4, 1;
|
29 |
+
and.b32 %r6, %r5, 510;
|
30 |
+
.loc 1 20 28
|
31 |
+
mov.u32 %r1, %ctaid.x;
|
32 |
+
.loc 1 20 46
|
33 |
+
mul.wide.s32 %rd3, %r1, 512;
|
34 |
+
cvt.u64.u32 %rd4, %r6;
|
35 |
+
.loc 1 21 23
|
36 |
+
or.b64 %rd5, %rd3, %rd4;
|
37 |
+
.loc 1 25 25
|
38 |
+
shl.b64 %rd6, %rd5, 2;
|
39 |
+
add.s64 %rd1, %rd2, %rd6;
|
40 |
+
mov.b32 %r2, 0;
|
41 |
+
mov.pred %p1, -1;
|
42 |
+
.loc 1 25 36
|
43 |
+
@%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
|
44 |
+
.loc 1 25 4
|
45 |
+
ret;
|
46 |
+
$L__tmp1:
|
47 |
+
$L__func_end0:
|
48 |
+
|
49 |
+
}
|
50 |
+
.file 1 "/tmp/torchinductor_root/pk/cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py"
|
51 |
+
.section .debug_abbrev
|
52 |
+
{
|
53 |
+
.b8 1
|
54 |
+
.b8 17
|
55 |
+
.b8 1
|
56 |
+
.b8 37
|
57 |
+
.b8 8
|
58 |
+
.b8 19
|
59 |
+
.b8 5
|
60 |
+
.b8 3
|
61 |
+
.b8 8
|
62 |
+
.b8 16
|
63 |
+
.b8 6
|
64 |
+
.b8 27
|
65 |
+
.b8 8
|
66 |
+
.b8 180
|
67 |
+
.b8 66
|
68 |
+
.b8 12
|
69 |
+
.b8 17
|
70 |
+
.b8 1
|
71 |
+
.b8 18
|
72 |
+
.b8 1
|
73 |
+
.b8 0
|
74 |
+
.b8 0
|
75 |
+
.b8 2
|
76 |
+
.b8 46
|
77 |
+
.b8 0
|
78 |
+
.b8 17
|
79 |
+
.b8 1
|
80 |
+
.b8 18
|
81 |
+
.b8 1
|
82 |
+
.b8 64
|
83 |
+
.b8 10
|
84 |
+
.b8 135
|
85 |
+
.b8 64
|
86 |
+
.b8 8
|
87 |
+
.b8 3
|
88 |
+
.b8 8
|
89 |
+
.b8 58
|
90 |
+
.b8 11
|
91 |
+
.b8 59
|
92 |
+
.b8 11
|
93 |
+
.b8 63
|
94 |
+
.b8 12
|
95 |
+
.b8 0
|
96 |
+
.b8 0
|
97 |
+
.b8 0
|
98 |
+
}
|
99 |
+
.section .debug_info
|
100 |
+
{
|
101 |
+
.b32 172
|
102 |
+
.b8 2
|
103 |
+
.b8 0
|
104 |
+
.b32 .debug_abbrev
|
105 |
+
.b8 8
|
106 |
+
.b8 1
|
107 |
+
.b8 116
|
108 |
+
.b8 114
|
109 |
+
.b8 105
|
110 |
+
.b8 116
|
111 |
+
.b8 111
|
112 |
+
.b8 110
|
113 |
+
.b8 0
|
114 |
+
.b8 2
|
115 |
+
.b8 0
|
116 |
+
.b8 99
|
117 |
+
.b8 112
|
118 |
+
.b8 107
|
119 |
+
.b8 119
|
120 |
+
.b8 51
|
121 |
+
.b8 98
|
122 |
+
.b8 100
|
123 |
+
.b8 111
|
124 |
+
.b8 97
|
125 |
+
.b8 109
|
126 |
+
.b8 108
|
127 |
+
.b8 103
|
128 |
+
.b8 122
|
129 |
+
.b8 118
|
130 |
+
.b8 113
|
131 |
+
.b8 106
|
132 |
+
.b8 101
|
133 |
+
.b8 121
|
134 |
+
.b8 117
|
135 |
+
.b8 107
|
136 |
+
.b8 51
|
137 |
+
.b8 52
|
138 |
+
.b8 98
|
139 |
+
.b8 51
|
140 |
+
.b8 106
|
141 |
+
.b8 99
|
142 |
+
.b8 106
|
143 |
+
.b8 102
|
144 |
+
.b8 53
|
145 |
+
.b8 55
|
146 |
+
.b8 104
|
147 |
+
.b8 116
|
148 |
+
.b8 105
|
149 |
+
.b8 115
|
150 |
+
.b8 97
|
151 |
+
.b8 114
|
152 |
+
.b8 97
|
153 |
+
.b8 55
|
154 |
+
.b8 108
|
155 |
+
.b8 117
|
156 |
+
.b8 107
|
157 |
+
.b8 102
|
158 |
+
.b8 108
|
159 |
+
.b8 101
|
160 |
+
.b8 120
|
161 |
+
.b8 111
|
162 |
+
.b8 51
|
163 |
+
.b8 116
|
164 |
+
.b8 50
|
165 |
+
.b8 50
|
166 |
+
.b8 101
|
167 |
+
.b8 119
|
168 |
+
.b8 46
|
169 |
+
.b8 112
|
170 |
+
.b8 121
|
171 |
+
.b8 0
|
172 |
+
.b32 .debug_line
|
173 |
+
.b8 47
|
174 |
+
.b8 116
|
175 |
+
.b8 109
|
176 |
+
.b8 112
|
177 |
+
.b8 47
|
178 |
+
.b8 116
|
179 |
+
.b8 111
|
180 |
+
.b8 114
|
181 |
+
.b8 99
|
182 |
+
.b8 104
|
183 |
+
.b8 105
|
184 |
+
.b8 110
|
185 |
+
.b8 100
|
186 |
+
.b8 117
|
187 |
+
.b8 99
|
188 |
+
.b8 116
|
189 |
+
.b8 111
|
190 |
+
.b8 114
|
191 |
+
.b8 95
|
192 |
+
.b8 114
|
193 |
+
.b8 111
|
194 |
+
.b8 111
|
195 |
+
.b8 116
|
196 |
+
.b8 47
|
197 |
+
.b8 112
|
198 |
+
.b8 107
|
199 |
+
.b8 0
|
200 |
+
.b8 1
|
201 |
+
.b64 $L__func_begin0
|
202 |
+
.b64 $L__func_end0
|
203 |
+
.b8 2
|
204 |
+
.b64 $L__func_begin0
|
205 |
+
.b64 $L__func_end0
|
206 |
+
.b8 1
|
207 |
+
.b8 156
|
208 |
+
.b8 116
|
209 |
+
.b8 114
|
210 |
+
.b8 105
|
211 |
+
.b8 116
|
212 |
+
.b8 111
|
213 |
+
.b8 110
|
214 |
+
.b8 95
|
215 |
+
.b8 95
|
216 |
+
.b8 48
|
217 |
+
.b8 100
|
218 |
+
.b8 49
|
219 |
+
.b8 100
|
220 |
+
.b8 101
|
221 |
+
.b8 0
|
222 |
+
.b8 116
|
223 |
+
.b8 114
|
224 |
+
.b8 105
|
225 |
+
.b8 116
|
226 |
+
.b8 111
|
227 |
+
.b8 110
|
228 |
+
.b8 95
|
229 |
+
.b8 95
|
230 |
+
.b8 48
|
231 |
+
.b8 100
|
232 |
+
.b8 49
|
233 |
+
.b8 100
|
234 |
+
.b8 101
|
235 |
+
.b8 0
|
236 |
+
.b8 1
|
237 |
+
.b8 18
|
238 |
+
.b8 1
|
239 |
+
.b8 0
|
240 |
+
}
|
241 |
+
.section .debug_pubnames
|
242 |
+
{
|
243 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
244 |
+
$L__pubNames_start0:
|
245 |
+
.b8 2
|
246 |
+
.b8 0
|
247 |
+
.b32 .debug_info
|
248 |
+
.b32 176
|
249 |
+
.b32 125
|
250 |
+
.b8 116
|
251 |
+
.b8 114
|
252 |
+
.b8 105
|
253 |
+
.b8 116
|
254 |
+
.b8 111
|
255 |
+
.b8 110
|
256 |
+
.b8 95
|
257 |
+
.b8 95
|
258 |
+
.b8 48
|
259 |
+
.b8 100
|
260 |
+
.b8 49
|
261 |
+
.b8 100
|
262 |
+
.b8 101
|
263 |
+
.b8 0
|
264 |
+
.b32 0
|
265 |
+
$L__pubNames_end0:
|
266 |
+
}
|
267 |
+
.section .debug_pubtypes
|
268 |
+
{
|
269 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
270 |
+
$L__pubTypes_start0:
|
271 |
+
.b8 2
|
272 |
+
.b8 0
|
273 |
+
.b32 .debug_info
|
274 |
+
.b32 176
|
275 |
+
.b32 0
|
276 |
+
$L__pubTypes_end0:
|
277 |
+
}
|
278 |
+
.section .debug_loc { }
|
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin
ADDED
Binary file (7.07 kB). View file
|
|
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%5 = and i32 %4, 127, !dbg !8
|
9 |
+
%6 = shl nuw nsw i32 %5, 3, !dbg !8
|
10 |
+
%7 = shl nuw nsw i32 %5, 2, !dbg !8
|
11 |
+
%8 = or i32 %7, 512, !dbg !8
|
12 |
+
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
|
13 |
+
%10 = shl i32 %9, 10, !dbg !10
|
14 |
+
%11 = or i32 %10, %6, !dbg !11
|
15 |
+
%12 = or i32 %10, %7, !dbg !11
|
16 |
+
%13 = or i32 %10, %8, !dbg !11
|
17 |
+
%14 = sext i32 %11 to i64, !dbg !12
|
18 |
+
%15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
|
19 |
+
%16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
|
20 |
+
%17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
|
21 |
+
%18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
|
22 |
+
%19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
|
23 |
+
%20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
|
24 |
+
%21 = trunc i32 %17 to i16, !dbg !13
|
25 |
+
%extelt.offset = lshr i32 %17, 16, !dbg !13
|
26 |
+
%22 = trunc i32 %extelt.offset to i16, !dbg !13
|
27 |
+
%23 = trunc i32 %18 to i16, !dbg !13
|
28 |
+
%extelt.offset1 = lshr i32 %18, 16, !dbg !13
|
29 |
+
%24 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
30 |
+
%25 = trunc i32 %19 to i16, !dbg !13
|
31 |
+
%extelt.offset2 = lshr i32 %19, 16, !dbg !13
|
32 |
+
%26 = trunc i32 %extelt.offset2 to i16, !dbg !13
|
33 |
+
%27 = trunc i32 %20 to i16, !dbg !13
|
34 |
+
%extelt.offset3 = lshr i32 %20, 16, !dbg !13
|
35 |
+
%28 = trunc i32 %extelt.offset3 to i16, !dbg !13
|
36 |
+
%29 = zext nneg i32 %6 to i64, !dbg !14
|
37 |
+
%30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
|
38 |
+
%31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
|
39 |
+
store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
|
40 |
+
%32 = or i32 %6, 1, !dbg !14
|
41 |
+
%33 = zext nneg i32 %32 to i64, !dbg !14
|
42 |
+
%34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
|
43 |
+
%35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
|
44 |
+
store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
|
45 |
+
%36 = or i32 %6, 2, !dbg !14
|
46 |
+
%37 = zext nneg i32 %36 to i64, !dbg !14
|
47 |
+
%38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
|
48 |
+
%39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
|
49 |
+
store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
|
50 |
+
%40 = or i32 %6, 3, !dbg !14
|
51 |
+
%41 = zext nneg i32 %40 to i64, !dbg !14
|
52 |
+
%42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
|
53 |
+
%43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
|
54 |
+
store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
|
55 |
+
%44 = or i32 %6, 4, !dbg !14
|
56 |
+
%45 = zext nneg i32 %44 to i64, !dbg !14
|
57 |
+
%46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
|
58 |
+
%47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
|
59 |
+
store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
|
60 |
+
%48 = or i32 %6, 5, !dbg !14
|
61 |
+
%49 = zext nneg i32 %48 to i64, !dbg !14
|
62 |
+
%50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
|
63 |
+
%51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
|
64 |
+
store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
|
65 |
+
%52 = or i32 %6, 6, !dbg !14
|
66 |
+
%53 = zext nneg i32 %52 to i64, !dbg !14
|
67 |
+
%54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
|
68 |
+
%55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
|
69 |
+
store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
|
70 |
+
%56 = or i32 %6, 7, !dbg !14
|
71 |
+
%57 = zext nneg i32 %56 to i64, !dbg !14
|
72 |
+
%58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
|
73 |
+
%59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
|
74 |
+
store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
|
75 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !14
|
76 |
+
%60 = zext nneg i32 %7 to i64, !dbg !14
|
77 |
+
%61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
|
78 |
+
%62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
|
79 |
+
%63 = or i32 %7, 1, !dbg !14
|
80 |
+
%64 = zext nneg i32 %63 to i64, !dbg !14
|
81 |
+
%65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
|
82 |
+
%66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
|
83 |
+
%67 = or i32 %7, 2, !dbg !14
|
84 |
+
%68 = zext nneg i32 %67 to i64, !dbg !14
|
85 |
+
%69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
|
86 |
+
%70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
|
87 |
+
%71 = or i32 %7, 3, !dbg !14
|
88 |
+
%72 = zext nneg i32 %71 to i64, !dbg !14
|
89 |
+
%73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
|
90 |
+
%74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
|
91 |
+
%75 = zext nneg i32 %8 to i64, !dbg !14
|
92 |
+
%76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
|
93 |
+
%77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
|
94 |
+
%78 = or i32 %7, 513, !dbg !14
|
95 |
+
%79 = zext nneg i32 %78 to i64, !dbg !14
|
96 |
+
%80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
|
97 |
+
%81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
|
98 |
+
%82 = or i32 %7, 514, !dbg !14
|
99 |
+
%83 = zext nneg i32 %82 to i64, !dbg !14
|
100 |
+
%84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
|
101 |
+
%85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
|
102 |
+
%86 = or i32 %7, 515, !dbg !14
|
103 |
+
%87 = zext nneg i32 %86 to i64, !dbg !14
|
104 |
+
%88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
|
105 |
+
%89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
|
106 |
+
%90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
|
107 |
+
%91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
|
108 |
+
%92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
|
109 |
+
%93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
|
110 |
+
%94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
|
111 |
+
%95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
|
112 |
+
%96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
|
113 |
+
%97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
|
114 |
+
%98 = sext i32 %12 to i64, !dbg !15
|
115 |
+
%99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
|
116 |
+
%100 = sext i32 %13 to i64, !dbg !15
|
117 |
+
%101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
|
118 |
+
%102 = bitcast float %90 to i32, !dbg !16
|
119 |
+
%103 = bitcast float %91 to i32, !dbg !16
|
120 |
+
%104 = bitcast float %92 to i32, !dbg !16
|
121 |
+
%105 = bitcast float %93 to i32, !dbg !16
|
122 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
|
123 |
+
%106 = bitcast float %94 to i32, !dbg !16
|
124 |
+
%107 = bitcast float %95 to i32, !dbg !16
|
125 |
+
%108 = bitcast float %96 to i32, !dbg !16
|
126 |
+
%109 = bitcast float %97 to i32, !dbg !16
|
127 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
|
128 |
+
ret void, !dbg !17
|
129 |
+
}
|
130 |
+
|
131 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
132 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
133 |
+
|
134 |
+
; Function Attrs: convergent nocallback nounwind
|
135 |
+
declare void @llvm.nvvm.barrier0() #1
|
136 |
+
|
137 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
138 |
+
attributes #1 = { convergent nocallback nounwind }
|
139 |
+
attributes #2 = { nounwind }
|
140 |
+
|
141 |
+
!llvm.module.flags = !{!0}
|
142 |
+
!llvm.dbg.cu = !{!1}
|
143 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
144 |
+
|
145 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
146 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
147 |
+
!2 = !DIFile(filename: "ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py", directory: "/tmp/torchinductor_root/k6")
|
148 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
149 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
150 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
151 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
152 |
+
!7 = !{}
|
153 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
154 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
155 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
156 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
157 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
158 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
159 |
+
!14 = !DILocation(line: 24, column: 44, scope: !5)
|
160 |
+
!15 = !DILocation(line: 26, column: 25, scope: !5)
|
161 |
+
!16 = !DILocation(line: 26, column: 36, scope: !5)
|
162 |
+
!17 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 128, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<4>;
|
20 |
+
.reg .b16 %rs<9>;
|
21 |
+
.reg .b32 %r<37>;
|
22 |
+
.reg .b64 %rd<13>;
|
23 |
+
.loc 1 18 0
|
24 |
+
$L__func_begin0:
|
25 |
+
.loc 1 18 0
|
26 |
+
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
28 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
29 |
+
$L__tmp0:
|
30 |
+
.loc 1 21 36
|
31 |
+
mov.u32 %r22, %tid.x;
|
32 |
+
and.b32 %r23, %r22, 127;
|
33 |
+
shl.b32 %r24, %r23, 3;
|
34 |
+
shl.b32 %r25, %r23, 2;
|
35 |
+
.loc 1 20 28
|
36 |
+
mov.u32 %r1, %ctaid.x;
|
37 |
+
.loc 1 20 33
|
38 |
+
shl.b32 %r26, %r1, 10;
|
39 |
+
.loc 1 21 23
|
40 |
+
or.b32 %r27, %r26, %r24;
|
41 |
+
or.b32 %r28, %r26, %r25;
|
42 |
+
.loc 1 24 30
|
43 |
+
mul.wide.s32 %rd6, %r27, 2;
|
44 |
+
add.s64 %rd1, %rd4, %rd6;
|
45 |
+
mov.pred %p1, -1;
|
46 |
+
.loc 1 24 35
|
47 |
+
mov.u32 %r2, 0x0;
|
48 |
+
mov.u32 %r3, 0x0;
|
49 |
+
mov.u32 %r4, 0x0;
|
50 |
+
mov.u32 %r5, 0x0;
|
51 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
52 |
+
shr.u32 %r29, %r2, 16;
|
53 |
+
shr.u32 %r30, %r3, 16;
|
54 |
+
shr.u32 %r31, %r4, 16;
|
55 |
+
shr.u32 %r32, %r5, 16;
|
56 |
+
.loc 1 24 44
|
57 |
+
shl.b32 %r33, %r23, 4;
|
58 |
+
mov.u32 %r34, global_smem;
|
59 |
+
add.s32 %r35, %r34, %r33;
|
60 |
+
st.shared.u16 [%r35], %r2;
|
61 |
+
st.shared.u16 [%r35+2], %r29;
|
62 |
+
st.shared.u16 [%r35+4], %r3;
|
63 |
+
st.shared.u16 [%r35+6], %r30;
|
64 |
+
st.shared.u16 [%r35+8], %r4;
|
65 |
+
st.shared.u16 [%r35+10], %r31;
|
66 |
+
st.shared.u16 [%r35+12], %r5;
|
67 |
+
st.shared.u16 [%r35+14], %r32;
|
68 |
+
bar.sync 0;
|
69 |
+
add.s32 %r36, %r34, %r24;
|
70 |
+
ld.shared.u16 %rs1, [%r36];
|
71 |
+
ld.shared.u16 %rs2, [%r36+2];
|
72 |
+
ld.shared.u16 %rs3, [%r36+4];
|
73 |
+
ld.shared.u16 %rs4, [%r36+6];
|
74 |
+
ld.shared.u16 %rs5, [%r36+1024];
|
75 |
+
ld.shared.u16 %rs6, [%r36+1026];
|
76 |
+
ld.shared.u16 %rs7, [%r36+1028];
|
77 |
+
ld.shared.u16 %rs8, [%r36+1030];
|
78 |
+
cvt.f32.bf16 %r14, %rs1;
|
79 |
+
cvt.f32.bf16 %r15, %rs2;
|
80 |
+
cvt.f32.bf16 %r16, %rs3;
|
81 |
+
cvt.f32.bf16 %r17, %rs4;
|
82 |
+
cvt.f32.bf16 %r18, %rs5;
|
83 |
+
cvt.f32.bf16 %r19, %rs6;
|
84 |
+
cvt.f32.bf16 %r20, %rs7;
|
85 |
+
cvt.f32.bf16 %r21, %rs8;
|
86 |
+
.loc 1 26 25
|
87 |
+
mul.wide.s32 %rd7, %r28, 4;
|
88 |
+
add.s64 %rd2, %rd5, %rd7;
|
89 |
+
cvt.s64.s32 %rd8, %r26;
|
90 |
+
cvt.u64.u32 %rd9, %r25;
|
91 |
+
or.b64 %rd10, %rd8, %rd9;
|
92 |
+
shl.b64 %rd11, %rd10, 2;
|
93 |
+
add.s64 %rd12, %rd5, %rd11;
|
94 |
+
add.s64 %rd3, %rd12, 2048;
|
95 |
+
.loc 1 26 36
|
96 |
+
@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
|
97 |
+
@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
|
98 |
+
.loc 1 26 4
|
99 |
+
ret;
|
100 |
+
$L__tmp1:
|
101 |
+
$L__func_end0:
|
102 |
+
|
103 |
+
}
|
104 |
+
.file 1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
|
105 |
+
.section .debug_abbrev
|
106 |
+
{
|
107 |
+
.b8 1
|
108 |
+
.b8 17
|
109 |
+
.b8 1
|
110 |
+
.b8 37
|
111 |
+
.b8 8
|
112 |
+
.b8 19
|
113 |
+
.b8 5
|
114 |
+
.b8 3
|
115 |
+
.b8 8
|
116 |
+
.b8 16
|
117 |
+
.b8 6
|
118 |
+
.b8 27
|
119 |
+
.b8 8
|
120 |
+
.b8 180
|
121 |
+
.b8 66
|
122 |
+
.b8 12
|
123 |
+
.b8 17
|
124 |
+
.b8 1
|
125 |
+
.b8 18
|
126 |
+
.b8 1
|
127 |
+
.b8 0
|
128 |
+
.b8 0
|
129 |
+
.b8 2
|
130 |
+
.b8 46
|
131 |
+
.b8 0
|
132 |
+
.b8 17
|
133 |
+
.b8 1
|
134 |
+
.b8 18
|
135 |
+
.b8 1
|
136 |
+
.b8 64
|
137 |
+
.b8 10
|
138 |
+
.b8 135
|
139 |
+
.b8 64
|
140 |
+
.b8 8
|
141 |
+
.b8 3
|
142 |
+
.b8 8
|
143 |
+
.b8 58
|
144 |
+
.b8 11
|
145 |
+
.b8 59
|
146 |
+
.b8 11
|
147 |
+
.b8 63
|
148 |
+
.b8 12
|
149 |
+
.b8 0
|
150 |
+
.b8 0
|
151 |
+
.b8 0
|
152 |
+
}
|
153 |
+
.section .debug_info
|
154 |
+
{
|
155 |
+
.b32 176
|
156 |
+
.b8 2
|
157 |
+
.b8 0
|
158 |
+
.b32 .debug_abbrev
|
159 |
+
.b8 8
|
160 |
+
.b8 1
|
161 |
+
.b8 116
|
162 |
+
.b8 114
|
163 |
+
.b8 105
|
164 |
+
.b8 116
|
165 |
+
.b8 111
|
166 |
+
.b8 110
|
167 |
+
.b8 0
|
168 |
+
.b8 2
|
169 |
+
.b8 0
|
170 |
+
.b8 99
|
171 |
+
.b8 107
|
172 |
+
.b8 54
|
173 |
+
.b8 50
|
174 |
+
.b8 107
|
175 |
+
.b8 50
|
176 |
+
.b8 120
|
177 |
+
.b8 122
|
178 |
+
.b8 98
|
179 |
+
.b8 98
|
180 |
+
.b8 54
|
181 |
+
.b8 53
|
182 |
+
.b8 55
|
183 |
+
.b8 115
|
184 |
+
.b8 110
|
185 |
+
.b8 102
|
186 |
+
.b8 100
|
187 |
+
.b8 111
|
188 |
+
.b8 119
|
189 |
+
.b8 119
|
190 |
+
.b8 97
|
191 |
+
.b8 110
|
192 |
+
.b8 122
|
193 |
+
.b8 115
|
194 |
+
.b8 122
|
195 |
+
.b8 97
|
196 |
+
.b8 105
|
197 |
+
.b8 106
|
198 |
+
.b8 54
|
199 |
+
.b8 113
|
200 |
+
.b8 122
|
201 |
+
.b8 119
|
202 |
+
.b8 54
|
203 |
+
.b8 118
|
204 |
+
.b8 117
|
205 |
+
.b8 99
|
206 |
+
.b8 55
|
207 |
+
.b8 99
|
208 |
+
.b8 102
|
209 |
+
.b8 105
|
210 |
+
.b8 100
|
211 |
+
.b8 111
|
212 |
+
.b8 109
|
213 |
+
.b8 106
|
214 |
+
.b8 112
|
215 |
+
.b8 107
|
216 |
+
.b8 107
|
217 |
+
.b8 54
|
218 |
+
.b8 105
|
219 |
+
.b8 103
|
220 |
+
.b8 99
|
221 |
+
.b8 109
|
222 |
+
.b8 46
|
223 |
+
.b8 112
|
224 |
+
.b8 121
|
225 |
+
.b8 0
|
226 |
+
.b32 .debug_line
|
227 |
+
.b8 47
|
228 |
+
.b8 116
|
229 |
+
.b8 109
|
230 |
+
.b8 112
|
231 |
+
.b8 47
|
232 |
+
.b8 116
|
233 |
+
.b8 111
|
234 |
+
.b8 114
|
235 |
+
.b8 99
|
236 |
+
.b8 104
|
237 |
+
.b8 105
|
238 |
+
.b8 110
|
239 |
+
.b8 100
|
240 |
+
.b8 117
|
241 |
+
.b8 99
|
242 |
+
.b8 116
|
243 |
+
.b8 111
|
244 |
+
.b8 114
|
245 |
+
.b8 95
|
246 |
+
.b8 114
|
247 |
+
.b8 111
|
248 |
+
.b8 111
|
249 |
+
.b8 116
|
250 |
+
.b8 47
|
251 |
+
.b8 107
|
252 |
+
.b8 54
|
253 |
+
.b8 0
|
254 |
+
.b8 1
|
255 |
+
.b64 $L__func_begin0
|
256 |
+
.b64 $L__func_end0
|
257 |
+
.b8 2
|
258 |
+
.b64 $L__func_begin0
|
259 |
+
.b64 $L__func_end0
|
260 |
+
.b8 1
|
261 |
+
.b8 156
|
262 |
+
.b8 116
|
263 |
+
.b8 114
|
264 |
+
.b8 105
|
265 |
+
.b8 116
|
266 |
+
.b8 111
|
267 |
+
.b8 110
|
268 |
+
.b8 95
|
269 |
+
.b8 95
|
270 |
+
.b8 48
|
271 |
+
.b8 100
|
272 |
+
.b8 49
|
273 |
+
.b8 100
|
274 |
+
.b8 50
|
275 |
+
.b8 100
|
276 |
+
.b8 101
|
277 |
+
.b8 0
|
278 |
+
.b8 116
|
279 |
+
.b8 114
|
280 |
+
.b8 105
|
281 |
+
.b8 116
|
282 |
+
.b8 111
|
283 |
+
.b8 110
|
284 |
+
.b8 95
|
285 |
+
.b8 95
|
286 |
+
.b8 48
|
287 |
+
.b8 100
|
288 |
+
.b8 49
|
289 |
+
.b8 100
|
290 |
+
.b8 50
|
291 |
+
.b8 100
|
292 |
+
.b8 101
|
293 |
+
.b8 0
|
294 |
+
.b8 1
|
295 |
+
.b8 18
|
296 |
+
.b8 1
|
297 |
+
.b8 0
|
298 |
+
}
|
299 |
+
.section .debug_pubnames
|
300 |
+
{
|
301 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
302 |
+
$L__pubNames_start0:
|
303 |
+
.b8 2
|
304 |
+
.b8 0
|
305 |
+
.b32 .debug_info
|
306 |
+
.b32 180
|
307 |
+
.b32 125
|
308 |
+
.b8 116
|
309 |
+
.b8 114
|
310 |
+
.b8 105
|
311 |
+
.b8 116
|
312 |
+
.b8 111
|
313 |
+
.b8 110
|
314 |
+
.b8 95
|
315 |
+
.b8 95
|
316 |
+
.b8 48
|
317 |
+
.b8 100
|
318 |
+
.b8 49
|
319 |
+
.b8 100
|
320 |
+
.b8 50
|
321 |
+
.b8 100
|
322 |
+
.b8 101
|
323 |
+
.b8 0
|
324 |
+
.b32 0
|
325 |
+
$L__pubNames_end0:
|
326 |
+
}
|
327 |
+
.section .debug_pubtypes
|
328 |
+
{
|
329 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
330 |
+
$L__pubTypes_start0:
|
331 |
+
.b8 2
|
332 |
+
.b8 0
|
333 |
+
.b32 .debug_info
|
334 |
+
.b32 180
|
335 |
+
.b32 0
|
336 |
+
$L__pubTypes_end0:
|
337 |
+
}
|
338 |
+
.section .debug_loc { }
|
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
9 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
|
10 |
+
%4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
11 |
+
%5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
|
12 |
+
%6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
|
13 |
+
%7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
|
14 |
+
%8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
15 |
+
%9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
16 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
17 |
+
%11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
|
18 |
+
%12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
|
19 |
+
%13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
|
20 |
+
%14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
|
21 |
+
tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
|
22 |
+
tt.return
|
23 |
+
}
|
24 |
+
}
|
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c1024_i32 = arith.constant 1024 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
12 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
13 |
+
%9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
14 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
15 |
+
tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin
ADDED
Binary file (10.3 kB). View file
|
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 63, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 6, !dbg !9
|
10 |
+
%9 = and i32 %8, 3, !dbg !9
|
11 |
+
%10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
12 |
+
%11 = shl i32 %10, 6, !dbg !11
|
13 |
+
%12 = or i32 %11, %7, !dbg !12
|
14 |
+
br label %13, !dbg !13
|
15 |
+
|
16 |
+
13: ; preds = %5, %13
|
17 |
+
%14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ]
|
18 |
+
%15 = phi i32 [ 0, %5 ], [ %24, %13 ]
|
19 |
+
%16 = or i32 %15, %9, !dbg !14
|
20 |
+
%17 = shl i32 %16, 17, !dbg !15
|
21 |
+
%18 = add i32 %17, %12, !dbg !16
|
22 |
+
%19 = sext i32 %18 to i64, !dbg !17
|
23 |
+
%20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17
|
24 |
+
%21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18
|
25 |
+
%22 = bitcast i32 %21 to float, !dbg !18
|
26 |
+
%23 = fadd float %14, %22, !dbg !19
|
27 |
+
%24 = add nuw nsw i32 %15, 4, !dbg !13
|
28 |
+
%25 = icmp ult i32 %15, 116, !dbg !13
|
29 |
+
br i1 %25, label %13, label %26, !dbg !13
|
30 |
+
|
31 |
+
26: ; preds = %13
|
32 |
+
%27 = shl nuw nsw i32 %7, 2, !dbg !20
|
33 |
+
%28 = or i32 %27, %9, !dbg !20
|
34 |
+
%29 = zext nneg i32 %28 to i64, !dbg !20
|
35 |
+
%30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20
|
36 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20
|
37 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
38 |
+
%31 = icmp slt i32 %6, 256, !dbg !20
|
39 |
+
%32 = sext i32 %6 to i64, !dbg !20
|
40 |
+
%33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20
|
41 |
+
%34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20
|
42 |
+
%35 = bitcast float %34 to i32, !dbg !20
|
43 |
+
%36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20
|
44 |
+
%37 = bitcast i32 %36 to float, !dbg !20
|
45 |
+
%38 = fadd float %34, %37, !dbg !24
|
46 |
+
%39 = bitcast float %38 to i32, !dbg !20
|
47 |
+
%40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20
|
48 |
+
%41 = bitcast i32 %40 to float, !dbg !20
|
49 |
+
%42 = fadd float %38, %41, !dbg !24
|
50 |
+
%43 = and i32 %6, 3, !dbg !20
|
51 |
+
%44 = icmp eq i32 %43, 0, !dbg !20
|
52 |
+
%45 = and i1 %31, %44, !dbg !20
|
53 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20
|
54 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
55 |
+
%46 = zext nneg i32 %27 to i64, !dbg !20
|
56 |
+
%47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20
|
57 |
+
%48 = load float, ptr addrspace(3) %47, align 4, !dbg !20
|
58 |
+
%.frozen = freeze i32 %12
|
59 |
+
%49 = sdiv i32 %.frozen, 256, !dbg !28
|
60 |
+
%50 = mul i32 %49, 256
|
61 |
+
%.decomposed = sub i32 %.frozen, %50
|
62 |
+
%51 = sext i32 %49 to i64, !dbg !29
|
63 |
+
%52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29
|
64 |
+
%53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30
|
65 |
+
%54 = lshr i64 %53, 54, !dbg !31
|
66 |
+
%55 = and i64 %54, 512, !dbg !31
|
67 |
+
%56 = add i64 %55, %53, !dbg !31
|
68 |
+
%57 = shl i64 %56, 8, !dbg !32
|
69 |
+
%58 = sext i32 %.decomposed to i64, !dbg !33
|
70 |
+
%59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34
|
71 |
+
%60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34
|
72 |
+
%61 = icmp eq i32 %9, 0, !dbg !35
|
73 |
+
%62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35
|
74 |
+
%63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35
|
75 |
+
ret void, !dbg !36
|
76 |
+
}
|
77 |
+
|
78 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
79 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
80 |
+
|
81 |
+
; Function Attrs: convergent nocallback nounwind
|
82 |
+
declare void @llvm.nvvm.barrier0() #1
|
83 |
+
|
84 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
85 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
86 |
+
|
87 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
88 |
+
attributes #1 = { convergent nocallback nounwind }
|
89 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
90 |
+
attributes #3 = { nounwind }
|
91 |
+
|
92 |
+
!llvm.module.flags = !{!0}
|
93 |
+
!llvm.dbg.cu = !{!1}
|
94 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
95 |
+
|
96 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
97 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
98 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
99 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
100 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
|
101 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
102 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
103 |
+
!7 = !{}
|
104 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
105 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
106 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
107 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
108 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
109 |
+
!13 = !DILocation(line: 27, column: 36, scope: !5)
|
110 |
+
!14 = !DILocation(line: 28, column: 27, scope: !5)
|
111 |
+
!15 = !DILocation(line: 31, column: 47, scope: !5)
|
112 |
+
!16 = !DILocation(line: 31, column: 40, scope: !5)
|
113 |
+
!17 = !DILocation(line: 31, column: 34, scope: !5)
|
114 |
+
!18 = !DILocation(line: 31, column: 53, scope: !5)
|
115 |
+
!19 = !DILocation(line: 34, column: 38, scope: !5)
|
116 |
+
!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
|
117 |
+
!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
|
118 |
+
!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
119 |
+
!23 = !DILocation(line: 35, column: 25, scope: !21)
|
120 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
121 |
+
!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
|
122 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
123 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
124 |
+
!28 = !DILocation(line: 36, column: 20, scope: !5)
|
125 |
+
!29 = !DILocation(line: 38, column: 30, scope: !5)
|
126 |
+
!30 = !DILocation(line: 38, column: 35, scope: !5)
|
127 |
+
!31 = !DILocation(line: 41, column: 32, scope: !5)
|
128 |
+
!32 = !DILocation(line: 45, column: 40, scope: !5)
|
129 |
+
!33 = !DILocation(line: 45, column: 36, scope: !5)
|
130 |
+
!34 = !DILocation(line: 45, column: 30, scope: !5)
|
131 |
+
!35 = !DILocation(line: 45, column: 55, scope: !5)
|
132 |
+
!36 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<64x1xi64, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
8 |
+
%cst_3 = arith.constant dense<131072> : tensor<1x4xi32, #blocked>
|
9 |
+
%cst_4 = arith.constant dense<120> : tensor<1x4xi32, #blocked>
|
10 |
+
%c0_i32 = arith.constant 0 : i32
|
11 |
+
%c120_i32 = arith.constant 120 : i32
|
12 |
+
%c4_i32 = arith.constant 4 : i32
|
13 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
|
14 |
+
%cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
|
15 |
+
%c64_i32 = arith.constant 64 : i32
|
16 |
+
%0 = tt.get_program_id x : i32
|
17 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
18 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
19 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
20 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
21 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32, #blocked>
|
22 |
+
%6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
23 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
|
24 |
+
%8 = tt.broadcast %5 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
25 |
+
%9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
|
26 |
+
%10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x4xf32, #blocked>) : i32 {
|
27 |
+
%27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32, #blocked>
|
28 |
+
%28 = arith.addi %27, %7 : tensor<1x4xi32, #blocked>
|
29 |
+
%29 = arith.cmpi slt, %28, %cst_4 : tensor<1x4xi32, #blocked>
|
30 |
+
%30 = arith.muli %28, %cst_3 : tensor<1x4xi32, #blocked>
|
31 |
+
%31 = tt.broadcast %30 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
32 |
+
%32 = arith.addi %8, %31 : tensor<64x4xi32, #blocked>
|
33 |
+
%33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
34 |
+
%34 = tt.broadcast %29 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
35 |
+
%35 = tt.load %33, %34, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
36 |
+
%36 = arith.addf %arg6, %35 : tensor<64x4xf32, #blocked>
|
37 |
+
%37 = arith.select %34, %36, %arg6 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
38 |
+
scf.yield %37 : tensor<64x4xf32, #blocked>
|
39 |
+
}
|
40 |
+
%11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
|
41 |
+
^bb0(%arg5: f32, %arg6: f32):
|
42 |
+
%27 = arith.addf %arg5, %arg6 : f32
|
43 |
+
tt.reduce.return %27 : f32
|
44 |
+
}) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
45 |
+
%12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
46 |
+
%13 = arith.divsi %5, %cst_2 : tensor<64x1xi32, #blocked>
|
47 |
+
%14 = arith.remsi %5, %cst_2 : tensor<64x1xi32, #blocked>
|
48 |
+
%15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
49 |
+
%16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
|
50 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
51 |
+
%18 = arith.addi %17, %cst_1 : tensor<64x1xi64, #blocked>
|
52 |
+
%19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64, #blocked>
|
53 |
+
%20 = arith.select %19, %18, %17 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
|
54 |
+
%21 = arith.muli %20, %cst : tensor<64x1xi64, #blocked>
|
55 |
+
%22 = arith.extsi %14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
|
56 |
+
%23 = arith.addi %22, %21 : tensor<64x1xi64, #blocked>
|
57 |
+
%24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked>
|
58 |
+
%25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xi64, #blocked>
|
59 |
+
%26 = "tt.atomic_rmw"(%25, %12, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked>
|
60 |
+
tt.return
|
61 |
+
}
|
62 |
+
}
|
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin
ADDED
Binary file (24.1 kB). View file
|
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
8 |
+
%c256_i32 = arith.constant 256 : i32
|
9 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
20 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
21 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
22 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
23 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
27 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
28 |
+
%17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
29 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
30 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
31 |
+
%20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
32 |
+
%21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
|
33 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
34 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
35 |
+
^bb0(%arg7: f32, %arg8: f32):
|
36 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
37 |
+
tt.reduce.return %40 : f32
|
38 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
39 |
+
%24 = arith.addf %23, %cst_2 : f32
|
40 |
+
%25 = arith.divf %24, %cst_1 : f32
|
41 |
+
%26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
|
42 |
+
%27 = arith.subf %21, %26 : tensor<256xf32, #blocked>
|
43 |
+
%28 = arith.mulf %27, %27 : tensor<256xf32, #blocked>
|
44 |
+
%29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
45 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg7: f32, %arg8: f32):
|
47 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
48 |
+
tt.reduce.return %40 : f32
|
49 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
50 |
+
%31 = arith.addf %30, %cst_2 : f32
|
51 |
+
%32 = arith.divf %31, %cst_1 : f32
|
52 |
+
%33 = arith.addf %32, %cst_0 : f32
|
53 |
+
%34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
54 |
+
%35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked>
|
55 |
+
%36 = arith.mulf %27, %35 : tensor<256xf32, #blocked>
|
56 |
+
%37 = arith.mulf %36, %19 : tensor<256xf32, #blocked>
|
57 |
+
%38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
58 |
+
%39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
59 |
+
tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
60 |
+
tt.return
|
61 |
+
}
|
62 |
+
}
|
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin
ADDED
Binary file (7.46 kB). View file
|
|
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx
ADDED
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d34e
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2d34e(
|
12 |
+
.param .u64 triton__0d1d2d34e_param_0,
|
13 |
+
.param .u64 triton__0d1d2d34e_param_1,
|
14 |
+
.param .u64 triton__0d1d2d34e_param_2,
|
15 |
+
.param .u32 triton__0d1d2d34e_param_3,
|
16 |
+
.param .u32 triton__0d1d2d34e_param_4
|
17 |
+
)
|
18 |
+
.maxntid 64, 1, 1
|
19 |
+
{
|
20 |
+
.reg .pred %p<6>;
|
21 |
+
.reg .b32 %r<27>;
|
22 |
+
.reg .f32 %f<9>;
|
23 |
+
.reg .b64 %rd<24>;
|
24 |
+
.loc 1 18 0
|
25 |
+
$L__func_begin0:
|
26 |
+
.loc 1 18 0
|
27 |
+
|
28 |
+
ld.param.u64 %rd4, [triton__0d1d2d34e_param_0];
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2d34e_param_1];
|
30 |
+
$L__tmp0:
|
31 |
+
.loc 1 25 34
|
32 |
+
mov.u32 %r7, %tid.x;
|
33 |
+
and.b32 %r8, %r7, 7;
|
34 |
+
ld.param.u64 %rd6, [triton__0d1d2d34e_param_2];
|
35 |
+
.loc 1 28 30
|
36 |
+
mul.wide.u32 %rd7, %r8, 4;
|
37 |
+
add.s64 %rd1, %rd5, %rd7;
|
38 |
+
mov.b32 %r2, 0;
|
39 |
+
mov.pred %p1, -1;
|
40 |
+
.loc 1 28 35
|
41 |
+
mov.u32 %r1, 0x0;
|
42 |
+
@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
|
43 |
+
@!%p1 mov.u32 %r1, %r2;
|
44 |
+
mov.b32 %f1, %r1;
|
45 |
+
.loc 1 29 30
|
46 |
+
mul.wide.u32 %rd8, %r8, 8;
|
47 |
+
add.s64 %rd3, %rd6, %rd8;
|
48 |
+
.loc 1 29 35
|
49 |
+
mov.u64 %rd2, 0x0;
|
50 |
+
@%p1 ld.global.b64 { %rd2 }, [ %rd3 + 0 ];
|
51 |
+
@!%p1 mov.u64 %rd2, 0x0;
|
52 |
+
$L__tmp1:
|
53 |
+
.loc 2 243 36
|
54 |
+
shfl.sync.bfly.b32 %r9, %r1, 4, 31, -1;
|
55 |
+
mov.b32 %f2, %r9;
|
56 |
+
$L__tmp2:
|
57 |
+
.loc 2 233 15
|
58 |
+
add.f32 %f3, %f1, %f2;
|
59 |
+
$L__tmp3:
|
60 |
+
.loc 2 243 36
|
61 |
+
mov.b32 %r10, %f3;
|
62 |
+
shfl.sync.bfly.b32 %r11, %r10, 2, 31, -1;
|
63 |
+
mov.b32 %f4, %r11;
|
64 |
+
$L__tmp4:
|
65 |
+
.loc 2 233 15
|
66 |
+
add.f32 %f5, %f3, %f4;
|
67 |
+
$L__tmp5:
|
68 |
+
.loc 2 243 36
|
69 |
+
mov.b32 %r12, %f5;
|
70 |
+
shfl.sync.bfly.b32 %r13, %r12, 1, 31, -1;
|
71 |
+
mov.b32 %f6, %r13;
|
72 |
+
$L__tmp6:
|
73 |
+
.loc 2 233 15
|
74 |
+
add.f32 %f7, %f5, %f6;
|
75 |
+
$L__tmp7:
|
76 |
+
.loc 2 243 36
|
77 |
+
cvt.u32.u64 %r14, %rd2;
|
78 |
+
shfl.sync.bfly.b32 %r15, %r14, 4, 31, -1;
|
79 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r16}, %rd2; }
|
80 |
+
shfl.sync.bfly.b32 %r17, %r16, 4, 31, -1;
|
81 |
+
cvt.u64.u32 %rd9, %r15;
|
82 |
+
cvt.u64.u32 %rd10, %r17;
|
83 |
+
shl.b64 %rd11, %rd10, 32;
|
84 |
+
or.b64 %rd12, %rd9, %rd11;
|
85 |
+
$L__tmp8:
|
86 |
+
.loc 2 233 15
|
87 |
+
add.s64 %rd13, %rd2, %rd12;
|
88 |
+
$L__tmp9:
|
89 |
+
.loc 2 243 36
|
90 |
+
cvt.u32.u64 %r18, %rd13;
|
91 |
+
shfl.sync.bfly.b32 %r19, %r18, 2, 31, -1;
|
92 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r20}, %rd13; }
|
93 |
+
shfl.sync.bfly.b32 %r21, %r20, 2, 31, -1;
|
94 |
+
cvt.u64.u32 %rd14, %r19;
|
95 |
+
cvt.u64.u32 %rd15, %r21;
|
96 |
+
shl.b64 %rd16, %rd15, 32;
|
97 |
+
or.b64 %rd17, %rd14, %rd16;
|
98 |
+
$L__tmp10:
|
99 |
+
.loc 2 233 15
|
100 |
+
add.s64 %rd18, %rd13, %rd17;
|
101 |
+
$L__tmp11:
|
102 |
+
.loc 2 243 36
|
103 |
+
cvt.u32.u64 %r22, %rd18;
|
104 |
+
shfl.sync.bfly.b32 %r23, %r22, 1, 31, -1;
|
105 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r24}, %rd18; }
|
106 |
+
shfl.sync.bfly.b32 %r25, %r24, 1, 31, -1;
|
107 |
+
cvt.u64.u32 %rd19, %r23;
|
108 |
+
cvt.u64.u32 %rd20, %r25;
|
109 |
+
shl.b64 %rd21, %rd20, 32;
|
110 |
+
or.b64 %rd22, %rd19, %rd21;
|
111 |
+
$L__tmp12:
|
112 |
+
.loc 2 233 15
|
113 |
+
add.s64 %rd23, %rd18, %rd22;
|
114 |
+
$L__tmp13:
|
115 |
+
.loc 1 36 20
|
116 |
+
cvt.rn.f32.s64 %f8, %rd23;
|
117 |
+
.loc 1 37 19
|
118 |
+
mov.b32 %r4, %f7;
|
119 |
+
mov.b32 %r5, %f8;
|
120 |
+
div.full.f32 %r6, %r4, %r5;
|
121 |
+
.loc 1 38 4
|
122 |
+
bar.sync 0;
|
123 |
+
.loc 1 39 71
|
124 |
+
and.b32 %r26, %r7, 63;
|
125 |
+
setp.eq.s32 %p5, %r26, 0;
|
126 |
+
@%p5 st.global.b32 [ %rd4 + 0 ], { %r6 };
|
127 |
+
.loc 1 39 4
|
128 |
+
ret;
|
129 |
+
$L__tmp14:
|
130 |
+
$L__func_end0:
|
131 |
+
|
132 |
+
}
|
133 |
+
.file 1 "/tmp/torchinductor_root/7z/c7zrzealf5bsn7qskl6y72zb73mh5bzf6uskuswp33lv4y5kk64w.py"
|
134 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
135 |
+
.section .debug_abbrev
|
136 |
+
{
|
137 |
+
.b8 1
|
138 |
+
.b8 17
|
139 |
+
.b8 1
|
140 |
+
.b8 37
|
141 |
+
.b8 8
|
142 |
+
.b8 19
|
143 |
+
.b8 5
|
144 |
+
.b8 3
|
145 |
+
.b8 8
|
146 |
+
.b8 16
|
147 |
+
.b8 6
|
148 |
+
.b8 27
|
149 |
+
.b8 8
|
150 |
+
.b8 180
|
151 |
+
.b8 66
|
152 |
+
.b8 12
|
153 |
+
.b8 17
|
154 |
+
.b8 1
|
155 |
+
.b8 18
|
156 |
+
.b8 1
|
157 |
+
.b8 0
|
158 |
+
.b8 0
|
159 |
+
.b8 2
|
160 |
+
.b8 46
|
161 |
+
.b8 0
|
162 |
+
.b8 135
|
163 |
+
.b8 64
|
164 |
+
.b8 8
|
165 |
+
.b8 3
|
166 |
+
.b8 8
|
167 |
+
.b8 58
|
168 |
+
.b8 11
|
169 |
+
.b8 59
|
170 |
+
.b8 11
|
171 |
+
.b8 63
|
172 |
+
.b8 12
|
173 |
+
.b8 32
|
174 |
+
.b8 11
|
175 |
+
.b8 0
|
176 |
+
.b8 0
|
177 |
+
.b8 3
|
178 |
+
.b8 46
|
179 |
+
.b8 1
|
180 |
+
.b8 17
|
181 |
+
.b8 1
|
182 |
+
.b8 18
|
183 |
+
.b8 1
|
184 |
+
.b8 64
|
185 |
+
.b8 10
|
186 |
+
.b8 49
|
187 |
+
.b8 19
|
188 |
+
.b8 0
|
189 |
+
.b8 0
|
190 |
+
.b8 4
|
191 |
+
.b8 29
|
192 |
+
.b8 0
|
193 |
+
.b8 49
|
194 |
+
.b8 19
|
195 |
+
.b8 17
|
196 |
+
.b8 1
|
197 |
+
.b8 18
|
198 |
+
.b8 1
|
199 |
+
.b8 88
|
200 |
+
.b8 11
|
201 |
+
.b8 89
|
202 |
+
.b8 11
|
203 |
+
.b8 87
|
204 |
+
.b8 11
|
205 |
+
.b8 0
|
206 |
+
.b8 0
|
207 |
+
.b8 5
|
208 |
+
.b8 29
|
209 |
+
.b8 1
|
210 |
+
.b8 49
|
211 |
+
.b8 19
|
212 |
+
.b8 17
|
213 |
+
.b8 1
|
214 |
+
.b8 18
|
215 |
+
.b8 1
|
216 |
+
.b8 88
|
217 |
+
.b8 11
|
218 |
+
.b8 89
|
219 |
+
.b8 11
|
220 |
+
.b8 87
|
221 |
+
.b8 11
|
222 |
+
.b8 0
|
223 |
+
.b8 0
|
224 |
+
.b8 0
|
225 |
+
}
|
226 |
+
.section .debug_info
|
227 |
+
{
|
228 |
+
.b32 333
|
229 |
+
.b8 2
|
230 |
+
.b8 0
|
231 |
+
.b32 .debug_abbrev
|
232 |
+
.b8 8
|
233 |
+
.b8 1
|
234 |
+
.b8 116
|
235 |
+
.b8 114
|
236 |
+
.b8 105
|
237 |
+
.b8 116
|
238 |
+
.b8 111
|
239 |
+
.b8 110
|
240 |
+
.b8 0
|
241 |
+
.b8 2
|
242 |
+
.b8 0
|
243 |
+
.b8 99
|
244 |
+
.b8 55
|
245 |
+
.b8 122
|
246 |
+
.b8 114
|
247 |
+
.b8 122
|
248 |
+
.b8 101
|
249 |
+
.b8 97
|
250 |
+
.b8 108
|
251 |
+
.b8 102
|
252 |
+
.b8 53
|
253 |
+
.b8 98
|
254 |
+
.b8 115
|
255 |
+
.b8 110
|
256 |
+
.b8 55
|
257 |
+
.b8 113
|
258 |
+
.b8 115
|
259 |
+
.b8 107
|
260 |
+
.b8 108
|
261 |
+
.b8 54
|
262 |
+
.b8 121
|
263 |
+
.b8 55
|
264 |
+
.b8 50
|
265 |
+
.b8 122
|
266 |
+
.b8 98
|
267 |
+
.b8 55
|
268 |
+
.b8 51
|
269 |
+
.b8 109
|
270 |
+
.b8 104
|
271 |
+
.b8 53
|
272 |
+
.b8 98
|
273 |
+
.b8 122
|
274 |
+
.b8 102
|
275 |
+
.b8 54
|
276 |
+
.b8 117
|
277 |
+
.b8 115
|
278 |
+
.b8 107
|
279 |
+
.b8 117
|
280 |
+
.b8 115
|
281 |
+
.b8 119
|
282 |
+
.b8 112
|
283 |
+
.b8 51
|
284 |
+
.b8 51
|
285 |
+
.b8 108
|
286 |
+
.b8 118
|
287 |
+
.b8 52
|
288 |
+
.b8 121
|
289 |
+
.b8 53
|
290 |
+
.b8 107
|
291 |
+
.b8 107
|
292 |
+
.b8 54
|
293 |
+
.b8 52
|
294 |
+
.b8 119
|
295 |
+
.b8 46
|
296 |
+
.b8 112
|
297 |
+
.b8 121
|
298 |
+
.b8 0
|
299 |
+
.b32 .debug_line
|
300 |
+
.b8 47
|
301 |
+
.b8 116
|
302 |
+
.b8 109
|
303 |
+
.b8 112
|
304 |
+
.b8 47
|
305 |
+
.b8 116
|
306 |
+
.b8 111
|
307 |
+
.b8 114
|
308 |
+
.b8 99
|
309 |
+
.b8 104
|
310 |
+
.b8 105
|
311 |
+
.b8 110
|
312 |
+
.b8 100
|
313 |
+
.b8 117
|
314 |
+
.b8 99
|
315 |
+
.b8 116
|
316 |
+
.b8 111
|
317 |
+
.b8 114
|
318 |
+
.b8 95
|
319 |
+
.b8 114
|
320 |
+
.b8 111
|
321 |
+
.b8 111
|
322 |
+
.b8 116
|
323 |
+
.b8 47
|
324 |
+
.b8 55
|
325 |
+
.b8 122
|
326 |
+
.b8 0
|
327 |
+
.b8 1
|
328 |
+
.b64 $L__func_begin0
|
329 |
+
.b64 $L__func_end0
|
330 |
+
.b8 2
|
331 |
+
.b8 116
|
332 |
+
.b8 114
|
333 |
+
.b8 105
|
334 |
+
.b8 116
|
335 |
+
.b8 111
|
336 |
+
.b8 110
|
337 |
+
.b8 95
|
338 |
+
.b8 95
|
339 |
+
.b8 48
|
340 |
+
.b8 100
|
341 |
+
.b8 49
|
342 |
+
.b8 100
|
343 |
+
.b8 50
|
344 |
+
.b8 100
|
345 |
+
.b8 51
|
346 |
+
.b8 52
|
347 |
+
.b8 101
|
348 |
+
.b8 0
|
349 |
+
.b8 116
|
350 |
+
.b8 114
|
351 |
+
.b8 105
|
352 |
+
.b8 116
|
353 |
+
.b8 111
|
354 |
+
.b8 110
|
355 |
+
.b8 95
|
356 |
+
.b8 95
|
357 |
+
.b8 48
|
358 |
+
.b8 100
|
359 |
+
.b8 49
|
360 |
+
.b8 100
|
361 |
+
.b8 50
|
362 |
+
.b8 100
|
363 |
+
.b8 51
|
364 |
+
.b8 52
|
365 |
+
.b8 101
|
366 |
+
.b8 0
|
367 |
+
.b8 1
|
368 |
+
.b8 18
|
369 |
+
.b8 1
|
370 |
+
.b8 1
|
371 |
+
.b8 3
|
372 |
+
.b64 $L__func_begin0
|
373 |
+
.b64 $L__func_end0
|
374 |
+
.b8 1
|
375 |
+
.b8 156
|
376 |
+
.b32 125
|
377 |
+
.b8 4
|
378 |
+
.b32 125
|
379 |
+
.b64 $L__tmp1
|
380 |
+
.b64 $L__tmp6
|
381 |
+
.b8 2
|
382 |
+
.b8 32
|
383 |
+
.b8 24
|
384 |
+
.b8 5
|
385 |
+
.b32 125
|
386 |
+
.b64 $L__tmp2
|
387 |
+
.b64 $L__tmp7
|
388 |
+
.b8 2
|
389 |
+
.b8 32
|
390 |
+
.b8 24
|
391 |
+
.b8 4
|
392 |
+
.b32 125
|
393 |
+
.b64 $L__tmp2
|
394 |
+
.b64 $L__tmp7
|
395 |
+
.b8 2
|
396 |
+
.b8 243
|
397 |
+
.b8 36
|
398 |
+
.b8 0
|
399 |
+
.b8 4
|
400 |
+
.b32 125
|
401 |
+
.b64 $L__tmp7
|
402 |
+
.b64 $L__tmp12
|
403 |
+
.b8 2
|
404 |
+
.b8 35
|
405 |
+
.b8 24
|
406 |
+
.b8 5
|
407 |
+
.b32 125
|
408 |
+
.b64 $L__tmp8
|
409 |
+
.b64 $L__tmp13
|
410 |
+
.b8 2
|
411 |
+
.b8 35
|
412 |
+
.b8 24
|
413 |
+
.b8 4
|
414 |
+
.b32 125
|
415 |
+
.b64 $L__tmp8
|
416 |
+
.b64 $L__tmp13
|
417 |
+
.b8 2
|
418 |
+
.b8 243
|
419 |
+
.b8 36
|
420 |
+
.b8 0
|
421 |
+
.b8 0
|
422 |
+
.b8 0
|
423 |
+
}
|
424 |
+
.section .debug_pubnames
|
425 |
+
{
|
426 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
427 |
+
$L__pubNames_start0:
|
428 |
+
.b8 2
|
429 |
+
.b8 0
|
430 |
+
.b32 .debug_info
|
431 |
+
.b32 337
|
432 |
+
.b32 125
|
433 |
+
.b8 116
|
434 |
+
.b8 114
|
435 |
+
.b8 105
|
436 |
+
.b8 116
|
437 |
+
.b8 111
|
438 |
+
.b8 110
|
439 |
+
.b8 95
|
440 |
+
.b8 95
|
441 |
+
.b8 48
|
442 |
+
.b8 100
|
443 |
+
.b8 49
|
444 |
+
.b8 100
|
445 |
+
.b8 50
|
446 |
+
.b8 100
|
447 |
+
.b8 51
|
448 |
+
.b8 52
|
449 |
+
.b8 101
|
450 |
+
.b8 0
|
451 |
+
.b32 0
|
452 |
+
$L__pubNames_end0:
|
453 |
+
}
|
454 |
+
.section .debug_pubtypes
|
455 |
+
{
|
456 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
457 |
+
$L__pubTypes_start0:
|
458 |
+
.b8 2
|
459 |
+
.b8 0
|
460 |
+
.b32 .debug_info
|
461 |
+
.b32 337
|
462 |
+
.b32 0
|
463 |
+
$L__pubTypes_end0:
|
464 |
+
}
|
465 |
+
.section .debug_loc { }
|
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin
ADDED
Binary file (29.9 kB). View file
|
|
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir
ADDED
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
|
7 |
+
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%11 = lshr i32 %10, 5, !dbg !8
|
9 |
+
%urem = and i32 %10, 255, !dbg !8
|
10 |
+
%12 = or i32 %urem, 256, !dbg !8
|
11 |
+
%13 = or i32 %urem, 512, !dbg !8
|
12 |
+
%14 = or i32 %urem, 768, !dbg !8
|
13 |
+
%15 = or i32 %urem, 1024, !dbg !8
|
14 |
+
%16 = or i32 %urem, 1280, !dbg !8
|
15 |
+
%17 = or i32 %urem, 1536, !dbg !8
|
16 |
+
%18 = or i32 %urem, 1792, !dbg !8
|
17 |
+
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
18 |
+
%20 = sext i32 %19 to i64, !dbg !10
|
19 |
+
%21 = insertelement <8 x i32> poison, i32 %urem, i64 0
|
20 |
+
%22 = insertelement <8 x i32> %21, i32 %12, i64 1
|
21 |
+
%23 = insertelement <8 x i32> %22, i32 %13, i64 2
|
22 |
+
%24 = insertelement <8 x i32> %23, i32 %14, i64 3
|
23 |
+
%25 = insertelement <8 x i32> %24, i32 %15, i64 4
|
24 |
+
%26 = insertelement <8 x i32> %25, i32 %16, i64 5
|
25 |
+
%27 = insertelement <8 x i32> %26, i32 %17, i64 6
|
26 |
+
%28 = insertelement <8 x i32> %27, i32 %18, i64 7
|
27 |
+
%29 = zext <8 x i32> %28 to <8 x i64>
|
28 |
+
%30 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !11
|
29 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !12
|
30 |
+
%32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13
|
31 |
+
%33 = bitcast i32 %32 to float, !dbg !13
|
32 |
+
%34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14
|
33 |
+
%35 = bitcast i32 %34 to float, !dbg !14
|
34 |
+
%36 = mul nsw i64 %20, 50257, !dbg !15
|
35 |
+
%.not = icmp eq i64 %31, -1, !dbg !16
|
36 |
+
%37 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %33, float %35) #3, !dbg !17
|
37 |
+
%38 = select i1 %.not, float 0.000000e+00, float %37, !dbg !18
|
38 |
+
%invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !19
|
39 |
+
%39 = insertelement <8 x float> poison, float %38, i64 0, !dbg !20
|
40 |
+
%40 = shufflevector <8 x float> %39, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !20
|
41 |
+
br label %41, !dbg !19
|
42 |
+
|
43 |
+
41: ; preds = %9, %41
|
44 |
+
%42 = phi i32 [ 0, %9 ], [ %85, %41 ]
|
45 |
+
%43 = phi <8 x float> [ zeroinitializer, %9 ], [ %84, %41 ]
|
46 |
+
%44 = zext nneg i32 %42 to i64, !dbg !21
|
47 |
+
%45 = insertelement <8 x i64> poison, i64 %44, i64 0, !dbg !21
|
48 |
+
%46 = shufflevector <8 x i64> %45, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !21
|
49 |
+
%47 = or <8 x i64> %46, %29, !dbg !21
|
50 |
+
%48 = icmp ult <8 x i64> %47, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
|
51 |
+
%49 = extractelement <8 x i64> %47, i64 0, !dbg !23
|
52 |
+
%gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %49, !dbg !23
|
53 |
+
%50 = extractelement <8 x i64> %47, i64 1, !dbg !23
|
54 |
+
%gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %50, !dbg !23
|
55 |
+
%51 = extractelement <8 x i64> %47, i64 2, !dbg !23
|
56 |
+
%gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %51, !dbg !23
|
57 |
+
%52 = extractelement <8 x i64> %47, i64 3, !dbg !23
|
58 |
+
%gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %52, !dbg !23
|
59 |
+
%53 = extractelement <8 x i64> %47, i64 4, !dbg !23
|
60 |
+
%gep9 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %53, !dbg !23
|
61 |
+
%54 = extractelement <8 x i64> %47, i64 5, !dbg !23
|
62 |
+
%gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %54, !dbg !23
|
63 |
+
%55 = extractelement <8 x i64> %47, i64 6, !dbg !23
|
64 |
+
%gep13 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %55, !dbg !23
|
65 |
+
%56 = extractelement <8 x i64> %47, i64 7, !dbg !23
|
66 |
+
%gep15 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %56, !dbg !23
|
67 |
+
%57 = extractelement <8 x i1> %48, i64 0, !dbg !24
|
68 |
+
%58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %57, i32 0, i1 %57) #3, !dbg !24
|
69 |
+
%59 = extractelement <8 x i1> %48, i64 1, !dbg !24
|
70 |
+
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %59, i32 0, i1 %59) #3, !dbg !24
|
71 |
+
%61 = extractelement <8 x i1> %48, i64 2, !dbg !24
|
72 |
+
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %61, i32 0, i1 %61) #3, !dbg !24
|
73 |
+
%63 = extractelement <8 x i1> %48, i64 3, !dbg !24
|
74 |
+
%64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %63, i32 0, i1 %63) #3, !dbg !24
|
75 |
+
%65 = extractelement <8 x i1> %48, i64 4, !dbg !24
|
76 |
+
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep9, i1 %65, i32 0, i1 %65) #3, !dbg !24
|
77 |
+
%67 = extractelement <8 x i1> %48, i64 5, !dbg !24
|
78 |
+
%68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep11, i1 %67, i32 0, i1 %67) #3, !dbg !24
|
79 |
+
%69 = extractelement <8 x i1> %48, i64 6, !dbg !24
|
80 |
+
%70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep13, i1 %69, i32 0, i1 %69) #3, !dbg !24
|
81 |
+
%71 = extractelement <8 x i1> %48, i64 7, !dbg !24
|
82 |
+
%72 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep15, i1 %71, i32 0, i1 %71) #3, !dbg !24
|
83 |
+
%73 = insertelement <8 x i32> poison, i32 %58, i64 0, !dbg !24
|
84 |
+
%74 = insertelement <8 x i32> %73, i32 %60, i64 1, !dbg !24
|
85 |
+
%75 = insertelement <8 x i32> %74, i32 %62, i64 2, !dbg !24
|
86 |
+
%76 = insertelement <8 x i32> %75, i32 %64, i64 3, !dbg !24
|
87 |
+
%77 = insertelement <8 x i32> %76, i32 %66, i64 4, !dbg !24
|
88 |
+
%78 = insertelement <8 x i32> %77, i32 %68, i64 5, !dbg !24
|
89 |
+
%79 = insertelement <8 x i32> %78, i32 %70, i64 6, !dbg !24
|
90 |
+
%80 = insertelement <8 x i32> %79, i32 %72, i64 7, !dbg !24
|
91 |
+
%81 = bitcast <8 x i32> %80 to <8 x float>, !dbg !24
|
92 |
+
%82 = fmul <8 x float> %40, %81, !dbg !20
|
93 |
+
%83 = select <8 x i1> %48, <8 x float> %82, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25
|
94 |
+
%84 = fadd <8 x float> %43, %83, !dbg !25
|
95 |
+
%85 = add nuw nsw i32 %42, 2048, !dbg !19
|
96 |
+
%86 = icmp ult i32 %42, 48209, !dbg !19
|
97 |
+
br i1 %86, label %41, label %87, !dbg !19
|
98 |
+
|
99 |
+
87: ; preds = %41
|
100 |
+
%88 = and i32 %10, 31, !dbg !8
|
101 |
+
%89 = and i32 %11, 7, !dbg !8
|
102 |
+
%shift = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
103 |
+
%90 = fadd <8 x float> %84, %shift, !dbg !26
|
104 |
+
%shift37 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
105 |
+
%91 = fadd <8 x float> %shift37, %90, !dbg !26
|
106 |
+
%shift38 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
107 |
+
%92 = fadd <8 x float> %shift38, %91, !dbg !26
|
108 |
+
%shift39 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
109 |
+
%93 = fadd <8 x float> %shift39, %92, !dbg !26
|
110 |
+
%shift40 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
111 |
+
%94 = fadd <8 x float> %shift40, %93, !dbg !26
|
112 |
+
%shift41 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
113 |
+
%95 = fadd <8 x float> %shift41, %94, !dbg !26
|
114 |
+
%shift42 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
|
115 |
+
%96 = fadd <8 x float> %shift42, %95, !dbg !26
|
116 |
+
%97 = extractelement <8 x float> %96, i64 0, !dbg !26
|
117 |
+
%98 = bitcast float %97 to i32, !dbg !32
|
118 |
+
%99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !32
|
119 |
+
%100 = bitcast i32 %99 to float, !dbg !32
|
120 |
+
%101 = fadd float %97, %100, !dbg !26
|
121 |
+
%102 = bitcast float %101 to i32, !dbg !32
|
122 |
+
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !32
|
123 |
+
%104 = bitcast i32 %103 to float, !dbg !32
|
124 |
+
%105 = fadd float %101, %104, !dbg !26
|
125 |
+
%106 = bitcast float %105 to i32, !dbg !32
|
126 |
+
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !32
|
127 |
+
%108 = bitcast i32 %107 to float, !dbg !32
|
128 |
+
%109 = fadd float %105, %108, !dbg !26
|
129 |
+
%110 = bitcast float %109 to i32, !dbg !32
|
130 |
+
%111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !32
|
131 |
+
%112 = bitcast i32 %111 to float, !dbg !32
|
132 |
+
%113 = fadd float %109, %112, !dbg !26
|
133 |
+
%114 = bitcast float %113 to i32, !dbg !32
|
134 |
+
%115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !32
|
135 |
+
%116 = bitcast i32 %115 to float, !dbg !32
|
136 |
+
%117 = fadd float %113, %116, !dbg !26
|
137 |
+
%118 = icmp eq i32 %88, 0, !dbg !32
|
138 |
+
%119 = zext nneg i32 %89 to i64, !dbg !32
|
139 |
+
%120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !32
|
140 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !32
|
141 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
142 |
+
%121 = icmp slt i32 %10, 8, !dbg !32
|
143 |
+
%122 = sext i32 %10 to i64, !dbg !32
|
144 |
+
%123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !32
|
145 |
+
%124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !32
|
146 |
+
%125 = bitcast float %124 to i32, !dbg !32
|
147 |
+
%126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 4, i32 31), !dbg !32
|
148 |
+
%127 = bitcast i32 %126 to float, !dbg !32
|
149 |
+
%128 = fadd float %124, %127, !dbg !26
|
150 |
+
%129 = bitcast float %128 to i32, !dbg !32
|
151 |
+
%130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 2, i32 31), !dbg !32
|
152 |
+
%131 = bitcast i32 %130 to float, !dbg !32
|
153 |
+
%132 = fadd float %128, %131, !dbg !26
|
154 |
+
%133 = bitcast float %132 to i32, !dbg !32
|
155 |
+
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !32
|
156 |
+
%135 = bitcast i32 %134 to float, !dbg !32
|
157 |
+
%136 = fadd float %132, %135, !dbg !26
|
158 |
+
%137 = and i32 %10, 7, !dbg !32
|
159 |
+
%138 = icmp eq i32 %137, 0, !dbg !32
|
160 |
+
%139 = and i1 %121, %138, !dbg !32
|
161 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %136, i1 %139) #3, !dbg !32
|
162 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
163 |
+
%140 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
164 |
+
%141 = extractelement <8 x i64> %29, i64 0, !dbg !34
|
165 |
+
%142 = extractelement <8 x i64> %29, i64 1, !dbg !34
|
166 |
+
%143 = extractelement <8 x i64> %29, i64 2, !dbg !34
|
167 |
+
%144 = extractelement <8 x i64> %29, i64 3, !dbg !34
|
168 |
+
%145 = extractelement <8 x i64> %29, i64 4, !dbg !34
|
169 |
+
%146 = extractelement <8 x i64> %29, i64 5, !dbg !34
|
170 |
+
%147 = extractelement <8 x i64> %29, i64 6, !dbg !34
|
171 |
+
%148 = extractelement <8 x i64> %29, i64 7, !dbg !34
|
172 |
+
br label %149, !dbg !35
|
173 |
+
|
174 |
+
149: ; preds = %87, %149
|
175 |
+
%150 = phi i32 [ 0, %87 ], [ %312, %149 ]
|
176 |
+
%151 = zext nneg i32 %150 to i64, !dbg !34
|
177 |
+
%152 = or i64 %141, %151, !dbg !34
|
178 |
+
%153 = or i64 %142, %151, !dbg !34
|
179 |
+
%154 = or i64 %143, %151, !dbg !34
|
180 |
+
%155 = or i64 %144, %151, !dbg !34
|
181 |
+
%156 = or i64 %145, %151, !dbg !34
|
182 |
+
%157 = or i64 %146, %151, !dbg !34
|
183 |
+
%158 = or i64 %147, %151, !dbg !34
|
184 |
+
%159 = or i64 %148, %151, !dbg !34
|
185 |
+
%160 = icmp ult i64 %152, 50257, !dbg !36
|
186 |
+
%161 = icmp ult i64 %153, 50257, !dbg !36
|
187 |
+
%162 = icmp ult i64 %154, 50257, !dbg !36
|
188 |
+
%163 = icmp ult i64 %155, 50257, !dbg !36
|
189 |
+
%164 = icmp ult i64 %156, 50257, !dbg !36
|
190 |
+
%165 = icmp ult i64 %157, 50257, !dbg !36
|
191 |
+
%166 = icmp ult i64 %158, 50257, !dbg !36
|
192 |
+
%167 = icmp ult i64 %159, 50257, !dbg !36
|
193 |
+
%168 = add nsw i64 %152, %36, !dbg !37
|
194 |
+
%169 = add nsw i64 %153, %36, !dbg !37
|
195 |
+
%170 = add nsw i64 %154, %36, !dbg !37
|
196 |
+
%171 = add nsw i64 %155, %36, !dbg !37
|
197 |
+
%172 = add nsw i64 %156, %36, !dbg !37
|
198 |
+
%173 = add nsw i64 %157, %36, !dbg !37
|
199 |
+
%174 = add nsw i64 %158, %36, !dbg !37
|
200 |
+
%175 = add nsw i64 %159, %36, !dbg !37
|
201 |
+
%176 = getelementptr i16, ptr addrspace(1) %4, i64 %168, !dbg !38
|
202 |
+
%177 = getelementptr i16, ptr addrspace(1) %4, i64 %169, !dbg !38
|
203 |
+
%178 = getelementptr i16, ptr addrspace(1) %4, i64 %170, !dbg !38
|
204 |
+
%179 = getelementptr i16, ptr addrspace(1) %4, i64 %171, !dbg !38
|
205 |
+
%180 = getelementptr i16, ptr addrspace(1) %4, i64 %172, !dbg !38
|
206 |
+
%181 = getelementptr i16, ptr addrspace(1) %4, i64 %173, !dbg !38
|
207 |
+
%182 = getelementptr i16, ptr addrspace(1) %4, i64 %174, !dbg !38
|
208 |
+
%183 = getelementptr i16, ptr addrspace(1) %4, i64 %175, !dbg !38
|
209 |
+
%184 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %160, i16 0, i1 %160) #3, !dbg !39
|
210 |
+
%185 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %177, i1 %161, i16 0, i1 %161) #3, !dbg !39
|
211 |
+
%186 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %178, i1 %162, i16 0, i1 %162) #3, !dbg !39
|
212 |
+
%187 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %179, i1 %163, i16 0, i1 %163) #3, !dbg !39
|
213 |
+
%188 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %180, i1 %164, i16 0, i1 %164) #3, !dbg !39
|
214 |
+
%189 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %181, i1 %165, i16 0, i1 %165) #3, !dbg !39
|
215 |
+
%190 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %182, i1 %166, i16 0, i1 %166) #3, !dbg !39
|
216 |
+
%191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %183, i1 %167, i16 0, i1 %167) #3, !dbg !39
|
217 |
+
%192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %184) #3, !dbg !40
|
218 |
+
%193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %185) #3, !dbg !40
|
219 |
+
%194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %186) #3, !dbg !40
|
220 |
+
%195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %187) #3, !dbg !40
|
221 |
+
%196 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !40
|
222 |
+
%197 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !40
|
223 |
+
%198 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !40
|
224 |
+
%199 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !40
|
225 |
+
%200 = getelementptr float, ptr addrspace(1) %0, i64 %168, !dbg !41
|
226 |
+
%201 = getelementptr float, ptr addrspace(1) %0, i64 %169, !dbg !41
|
227 |
+
%202 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !41
|
228 |
+
%203 = getelementptr float, ptr addrspace(1) %0, i64 %171, !dbg !41
|
229 |
+
%204 = getelementptr float, ptr addrspace(1) %0, i64 %172, !dbg !41
|
230 |
+
%205 = getelementptr float, ptr addrspace(1) %0, i64 %173, !dbg !41
|
231 |
+
%206 = getelementptr float, ptr addrspace(1) %0, i64 %174, !dbg !41
|
232 |
+
%207 = getelementptr float, ptr addrspace(1) %0, i64 %175, !dbg !41
|
233 |
+
%208 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %200, i1 %160, i32 0, i1 %160) #3, !dbg !42
|
234 |
+
%209 = bitcast i32 %208 to float, !dbg !42
|
235 |
+
%210 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %201, i1 %161, i32 0, i1 %161) #3, !dbg !42
|
236 |
+
%211 = bitcast i32 %210 to float, !dbg !42
|
237 |
+
%212 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %202, i1 %162, i32 0, i1 %162) #3, !dbg !42
|
238 |
+
%213 = bitcast i32 %212 to float, !dbg !42
|
239 |
+
%214 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %203, i1 %163, i32 0, i1 %163) #3, !dbg !42
|
240 |
+
%215 = bitcast i32 %214 to float, !dbg !42
|
241 |
+
%216 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %204, i1 %164, i32 0, i1 %164) #3, !dbg !42
|
242 |
+
%217 = bitcast i32 %216 to float, !dbg !42
|
243 |
+
%218 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %205, i1 %165, i32 0, i1 %165) #3, !dbg !42
|
244 |
+
%219 = bitcast i32 %218 to float, !dbg !42
|
245 |
+
%220 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %206, i1 %166, i32 0, i1 %166) #3, !dbg !42
|
246 |
+
%221 = bitcast i32 %220 to float, !dbg !42
|
247 |
+
%222 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %207, i1 %167, i32 0, i1 %167) #3, !dbg !42
|
248 |
+
%223 = bitcast i32 %222 to float, !dbg !42
|
249 |
+
%224 = getelementptr i16, ptr addrspace(1) %5, i64 %168, !dbg !43
|
250 |
+
%225 = getelementptr i16, ptr addrspace(1) %5, i64 %169, !dbg !43
|
251 |
+
%226 = getelementptr i16, ptr addrspace(1) %5, i64 %170, !dbg !43
|
252 |
+
%227 = getelementptr i16, ptr addrspace(1) %5, i64 %171, !dbg !43
|
253 |
+
%228 = getelementptr i16, ptr addrspace(1) %5, i64 %172, !dbg !43
|
254 |
+
%229 = getelementptr i16, ptr addrspace(1) %5, i64 %173, !dbg !43
|
255 |
+
%230 = getelementptr i16, ptr addrspace(1) %5, i64 %174, !dbg !43
|
256 |
+
%231 = getelementptr i16, ptr addrspace(1) %5, i64 %175, !dbg !43
|
257 |
+
%232 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %224, i1 %160, i16 0, i1 %160) #3, !dbg !44
|
258 |
+
%233 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %225, i1 %161, i16 0, i1 %161) #3, !dbg !44
|
259 |
+
%234 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %226, i1 %162, i16 0, i1 %162) #3, !dbg !44
|
260 |
+
%235 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %227, i1 %163, i16 0, i1 %163) #3, !dbg !44
|
261 |
+
%236 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %228, i1 %164, i16 0, i1 %164) #3, !dbg !44
|
262 |
+
%237 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %229, i1 %165, i16 0, i1 %165) #3, !dbg !44
|
263 |
+
%238 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %230, i1 %166, i16 0, i1 %166) #3, !dbg !44
|
264 |
+
%239 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %231, i1 %167, i16 0, i1 %167) #3, !dbg !44
|
265 |
+
%240 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %232) #3, !dbg !45
|
266 |
+
%241 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %233) #3, !dbg !45
|
267 |
+
%242 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %234) #3, !dbg !45
|
268 |
+
%243 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %235) #3, !dbg !45
|
269 |
+
%244 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %236) #3, !dbg !45
|
270 |
+
%245 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %237) #3, !dbg !45
|
271 |
+
%246 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %238) #3, !dbg !45
|
272 |
+
%247 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %239) #3, !dbg !45
|
273 |
+
%248 = fmul float %38, %209, !dbg !46
|
274 |
+
%249 = fmul float %38, %211, !dbg !46
|
275 |
+
%250 = fmul float %38, %213, !dbg !46
|
276 |
+
%251 = fmul float %38, %215, !dbg !46
|
277 |
+
%252 = fmul float %38, %217, !dbg !46
|
278 |
+
%253 = fmul float %38, %219, !dbg !46
|
279 |
+
%254 = fmul float %38, %221, !dbg !46
|
280 |
+
%255 = fmul float %38, %223, !dbg !46
|
281 |
+
%256 = fmul float %240, 0x3FF7154760000000, !dbg !47
|
282 |
+
%257 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %256) #3, !dbg !47
|
283 |
+
%258 = fmul float %241, 0x3FF7154760000000, !dbg !47
|
284 |
+
%259 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %258) #3, !dbg !47
|
285 |
+
%260 = fmul float %242, 0x3FF7154760000000, !dbg !47
|
286 |
+
%261 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %260) #3, !dbg !47
|
287 |
+
%262 = fmul float %243, 0x3FF7154760000000, !dbg !47
|
288 |
+
%263 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %262) #3, !dbg !47
|
289 |
+
%264 = fmul float %244, 0x3FF7154760000000, !dbg !47
|
290 |
+
%265 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %264) #3, !dbg !47
|
291 |
+
%266 = fmul float %245, 0x3FF7154760000000, !dbg !47
|
292 |
+
%267 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %266) #3, !dbg !47
|
293 |
+
%268 = fmul float %246, 0x3FF7154760000000, !dbg !47
|
294 |
+
%269 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %268) #3, !dbg !47
|
295 |
+
%270 = fmul float %247, 0x3FF7154760000000, !dbg !47
|
296 |
+
%271 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %270) #3, !dbg !47
|
297 |
+
%272 = fmul float %140, %257, !dbg !48
|
298 |
+
%273 = fmul float %140, %259, !dbg !48
|
299 |
+
%274 = fmul float %140, %261, !dbg !48
|
300 |
+
%275 = fmul float %140, %263, !dbg !48
|
301 |
+
%276 = fmul float %140, %265, !dbg !48
|
302 |
+
%277 = fmul float %140, %267, !dbg !48
|
303 |
+
%278 = fmul float %140, %269, !dbg !48
|
304 |
+
%279 = fmul float %140, %271, !dbg !48
|
305 |
+
%280 = fsub float %248, %272, !dbg !49
|
306 |
+
%281 = fsub float %249, %273, !dbg !49
|
307 |
+
%282 = fsub float %250, %274, !dbg !49
|
308 |
+
%283 = fsub float %251, %275, !dbg !49
|
309 |
+
%284 = fsub float %252, %276, !dbg !49
|
310 |
+
%285 = fsub float %253, %277, !dbg !49
|
311 |
+
%286 = fsub float %254, %278, !dbg !49
|
312 |
+
%287 = fsub float %255, %279, !dbg !49
|
313 |
+
%288 = fadd float %192, %280, !dbg !50
|
314 |
+
%289 = fadd float %193, %281, !dbg !50
|
315 |
+
%290 = fadd float %194, %282, !dbg !50
|
316 |
+
%291 = fadd float %195, %283, !dbg !50
|
317 |
+
%292 = fadd float %196, %284, !dbg !50
|
318 |
+
%293 = fadd float %197, %285, !dbg !50
|
319 |
+
%294 = fadd float %198, %286, !dbg !50
|
320 |
+
%295 = fadd float %199, %287, !dbg !50
|
321 |
+
%296 = getelementptr i16, ptr addrspace(1) %6, i64 %168, !dbg !51
|
322 |
+
%297 = getelementptr i16, ptr addrspace(1) %6, i64 %169, !dbg !51
|
323 |
+
%298 = getelementptr i16, ptr addrspace(1) %6, i64 %170, !dbg !51
|
324 |
+
%299 = getelementptr i16, ptr addrspace(1) %6, i64 %171, !dbg !51
|
325 |
+
%300 = getelementptr i16, ptr addrspace(1) %6, i64 %172, !dbg !51
|
326 |
+
%301 = getelementptr i16, ptr addrspace(1) %6, i64 %173, !dbg !51
|
327 |
+
%302 = getelementptr i16, ptr addrspace(1) %6, i64 %174, !dbg !51
|
328 |
+
%303 = getelementptr i16, ptr addrspace(1) %6, i64 %175, !dbg !51
|
329 |
+
%304 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %288) #3, !dbg !52
|
330 |
+
%305 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %289) #3, !dbg !52
|
331 |
+
%306 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %290) #3, !dbg !52
|
332 |
+
%307 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %291) #3, !dbg !52
|
333 |
+
%308 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %292) #3, !dbg !52
|
334 |
+
%309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %293) #3, !dbg !52
|
335 |
+
%310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %294) #3, !dbg !52
|
336 |
+
%311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %295) #3, !dbg !52
|
337 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %304, ptr addrspace(1) %296, i1 %160) #3, !dbg !52
|
338 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %305, ptr addrspace(1) %297, i1 %161) #3, !dbg !52
|
339 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %306, ptr addrspace(1) %298, i1 %162) #3, !dbg !52
|
340 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %307, ptr addrspace(1) %299, i1 %163) #3, !dbg !52
|
341 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %308, ptr addrspace(1) %300, i1 %164) #3, !dbg !52
|
342 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %309, ptr addrspace(1) %301, i1 %165) #3, !dbg !52
|
343 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %310, ptr addrspace(1) %302, i1 %166) #3, !dbg !52
|
344 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %311, ptr addrspace(1) %303, i1 %167) #3, !dbg !52
|
345 |
+
%312 = add nuw nsw i32 %150, 2048, !dbg !35
|
346 |
+
%313 = icmp ult i32 %150, 48209, !dbg !35
|
347 |
+
br i1 %313, label %149, label %314, !dbg !35
|
348 |
+
|
349 |
+
314: ; preds = %149
|
350 |
+
ret void, !dbg !53
|
351 |
+
}
|
352 |
+
|
353 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
354 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
355 |
+
|
356 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
357 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
358 |
+
|
359 |
+
; Function Attrs: convergent nocallback nounwind
|
360 |
+
declare void @llvm.nvvm.barrier0() #2
|
361 |
+
|
362 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
363 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
364 |
+
attributes #2 = { convergent nocallback nounwind }
|
365 |
+
attributes #3 = { nounwind }
|
366 |
+
|
367 |
+
!llvm.module.flags = !{!0}
|
368 |
+
!llvm.dbg.cu = !{!1}
|
369 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
370 |
+
|
371 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
372 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
373 |
+
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
|
374 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
|
375 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
|
376 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
377 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
378 |
+
!7 = !{}
|
379 |
+
!8 = !DILocation(line: 24, column: 33, scope: !5)
|
380 |
+
!9 = !DILocation(line: 21, column: 28, scope: !5)
|
381 |
+
!10 = !DILocation(line: 21, column: 34, scope: !5)
|
382 |
+
!11 = !DILocation(line: 26, column: 30, scope: !5)
|
383 |
+
!12 = !DILocation(line: 26, column: 35, scope: !5)
|
384 |
+
!13 = !DILocation(line: 27, column: 19, scope: !5)
|
385 |
+
!14 = !DILocation(line: 29, column: 19, scope: !5)
|
386 |
+
!15 = !DILocation(line: 36, column: 46, scope: !5)
|
387 |
+
!16 = !DILocation(line: 38, column: 23, scope: !5)
|
388 |
+
!17 = !DILocation(line: 39, column: 22, scope: !5)
|
389 |
+
!18 = !DILocation(line: 41, column: 37, scope: !5)
|
390 |
+
!19 = !DILocation(line: 32, column: 36, scope: !5)
|
391 |
+
!20 = !DILocation(line: 42, column: 23, scope: !5)
|
392 |
+
!21 = !DILocation(line: 33, column: 27, scope: !5)
|
393 |
+
!22 = !DILocation(line: 34, column: 25, scope: !5)
|
394 |
+
!23 = !DILocation(line: 36, column: 34, scope: !5)
|
395 |
+
!24 = !DILocation(line: 36, column: 52, scope: !5)
|
396 |
+
!25 = !DILocation(line: 45, column: 40, scope: !5)
|
397 |
+
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
|
398 |
+
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
|
399 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
400 |
+
!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
|
401 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
402 |
+
!31 = !DILocation(line: 46, column: 27, scope: !27)
|
403 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
404 |
+
!33 = !DILocation(line: 46, column: 27, scope: !29)
|
405 |
+
!34 = !DILocation(line: 52, column: 27, scope: !5)
|
406 |
+
!35 = !DILocation(line: 51, column: 36, scope: !5)
|
407 |
+
!36 = !DILocation(line: 53, column: 25, scope: !5)
|
408 |
+
!37 = !DILocation(line: 55, column: 41, scope: !5)
|
409 |
+
!38 = !DILocation(line: 55, column: 35, scope: !5)
|
410 |
+
!39 = !DILocation(line: 55, column: 53, scope: !5)
|
411 |
+
!40 = !DILocation(line: 55, column: 105, scope: !5)
|
412 |
+
!41 = !DILocation(line: 56, column: 35, scope: !5)
|
413 |
+
!42 = !DILocation(line: 56, column: 53, scope: !5)
|
414 |
+
!43 = !DILocation(line: 57, column: 35, scope: !5)
|
415 |
+
!44 = !DILocation(line: 57, column: 53, scope: !5)
|
416 |
+
!45 = !DILocation(line: 57, column: 105, scope: !5)
|
417 |
+
!46 = !DILocation(line: 63, column: 24, scope: !5)
|
418 |
+
!47 = !DILocation(line: 65, column: 23, scope: !5)
|
419 |
+
!48 = !DILocation(line: 66, column: 24, scope: !5)
|
420 |
+
!49 = !DILocation(line: 67, column: 24, scope: !5)
|
421 |
+
!50 = !DILocation(line: 69, column: 24, scope: !5)
|
422 |
+
!51 = !DILocation(line: 70, column: 29, scope: !5)
|
423 |
+
!52 = !DILocation(line: 70, column: 54, scope: !5)
|
424 |
+
!53 = !DILocation(line: 51, column: 4, scope: !5)
|
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx
ADDED
@@ -0,0 +1,921 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
|
22 |
+
)
|
23 |
+
.maxntid 256, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<83>;
|
26 |
+
.reg .b16 %rs<65>;
|
27 |
+
.reg .b32 %r<104>;
|
28 |
+
.reg .f32 %f<164>;
|
29 |
+
.reg .b64 %rd<126>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_6];
|
35 |
+
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_5];
|
36 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_4];
|
37 |
+
ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7de8_param_0];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 24 33
|
40 |
+
mov.u32 %r1, %tid.x;
|
41 |
+
ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7de8_param_1];
|
42 |
+
shr.u32 %r2, %r1, 5;
|
43 |
+
ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7de8_param_2];
|
44 |
+
and.b32 %r9, %r1, 255;
|
45 |
+
ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7de8_param_3];
|
46 |
+
or.b32 %r10, %r9, 256;
|
47 |
+
or.b32 %r11, %r9, 512;
|
48 |
+
or.b32 %r12, %r9, 768;
|
49 |
+
or.b32 %r13, %r9, 1024;
|
50 |
+
or.b32 %r14, %r9, 1280;
|
51 |
+
or.b32 %r15, %r9, 1536;
|
52 |
+
or.b32 %r16, %r9, 1792;
|
53 |
+
.loc 1 21 28
|
54 |
+
mov.u32 %r3, %ctaid.x;
|
55 |
+
cvt.u64.u32 %rd1, %r9;
|
56 |
+
cvt.u64.u32 %rd8, %r16;
|
57 |
+
cvt.u64.u32 %rd7, %r15;
|
58 |
+
cvt.u64.u32 %rd6, %r14;
|
59 |
+
cvt.u64.u32 %rd5, %r13;
|
60 |
+
cvt.u64.u32 %rd4, %r12;
|
61 |
+
cvt.u64.u32 %rd3, %r11;
|
62 |
+
cvt.u64.u32 %rd2, %r10;
|
63 |
+
.loc 1 26 30
|
64 |
+
mul.wide.s32 %rd33, %r3, 8;
|
65 |
+
add.s64 %rd27, %rd32, %rd33;
|
66 |
+
mov.pred %p1, -1;
|
67 |
+
.loc 1 26 35
|
68 |
+
mov.u64 %rd26, 0x0;
|
69 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];
|
70 |
+
.loc 1 27 19
|
71 |
+
mov.u32 %r7, 0x0;
|
72 |
+
@%p1 ld.global.b32 { %r7 }, [ %rd28 + 0 ];
|
73 |
+
.loc 1 29 19
|
74 |
+
mov.u32 %r8, 0x0;
|
75 |
+
@%p1 ld.global.b32 { %r8 }, [ %rd29 + 0 ];
|
76 |
+
.loc 1 36 46
|
77 |
+
mul.wide.s32 %rd9, %r3, 50257;
|
78 |
+
.loc 1 38 23
|
79 |
+
setp.eq.s64 %p4, %rd26, -1;
|
80 |
+
.loc 1 39 22
|
81 |
+
div.full.f32 %r6, %r7, %r8;
|
82 |
+
mov.b32 %f35, %r6;
|
83 |
+
.loc 1 41 37
|
84 |
+
selp.f32 %f2, 0f00000000, %f35, %p4;
|
85 |
+
.loc 1 32 36
|
86 |
+
shl.b64 %rd34, %rd9, 2;
|
87 |
+
add.s64 %rd10, %rd31, %rd34;
|
88 |
+
mov.f32 %f156, 0f00000000;
|
89 |
+
mov.u64 %rd124, 0;
|
90 |
+
mov.f32 %f157, %f156;
|
91 |
+
mov.f32 %f158, %f156;
|
92 |
+
mov.f32 %f159, %f156;
|
93 |
+
mov.f32 %f160, %f156;
|
94 |
+
mov.f32 %f161, %f156;
|
95 |
+
mov.f32 %f162, %f156;
|
96 |
+
mov.f32 %f163, %f156;
|
97 |
+
$L__BB0_1:
|
98 |
+
.loc 1 33 27
|
99 |
+
or.b64 %rd43, %rd124, %rd1;
|
100 |
+
or.b64 %rd44, %rd124, %rd2;
|
101 |
+
or.b64 %rd45, %rd124, %rd3;
|
102 |
+
or.b64 %rd46, %rd124, %rd4;
|
103 |
+
or.b64 %rd47, %rd124, %rd5;
|
104 |
+
or.b64 %rd48, %rd124, %rd6;
|
105 |
+
or.b64 %rd49, %rd124, %rd7;
|
106 |
+
or.b64 %rd50, %rd124, %rd8;
|
107 |
+
.loc 1 34 25
|
108 |
+
setp.lt.u64 %p20, %rd50, 50257;
|
109 |
+
setp.lt.u64 %p18, %rd49, 50257;
|
110 |
+
setp.lt.u64 %p16, %rd48, 50257;
|
111 |
+
setp.lt.u64 %p14, %rd47, 50257;
|
112 |
+
setp.lt.u64 %p12, %rd46, 50257;
|
113 |
+
setp.lt.u64 %p10, %rd45, 50257;
|
114 |
+
setp.lt.u64 %p8, %rd44, 50257;
|
115 |
+
setp.lt.u64 %p6, %rd43, 50257;
|
116 |
+
.loc 1 36 34
|
117 |
+
shl.b64 %rd51, %rd43, 2;
|
118 |
+
add.s64 %rd35, %rd10, %rd51;
|
119 |
+
shl.b64 %rd52, %rd44, 2;
|
120 |
+
add.s64 %rd36, %rd10, %rd52;
|
121 |
+
shl.b64 %rd53, %rd45, 2;
|
122 |
+
add.s64 %rd37, %rd10, %rd53;
|
123 |
+
shl.b64 %rd54, %rd46, 2;
|
124 |
+
add.s64 %rd38, %rd10, %rd54;
|
125 |
+
shl.b64 %rd55, %rd47, 2;
|
126 |
+
add.s64 %rd39, %rd10, %rd55;
|
127 |
+
shl.b64 %rd56, %rd48, 2;
|
128 |
+
add.s64 %rd40, %rd10, %rd56;
|
129 |
+
shl.b64 %rd57, %rd49, 2;
|
130 |
+
add.s64 %rd41, %rd10, %rd57;
|
131 |
+
shl.b64 %rd58, %rd50, 2;
|
132 |
+
add.s64 %rd42, %rd10, %rd58;
|
133 |
+
mov.b32 %r71, 0;
|
134 |
+
.loc 1 36 52
|
135 |
+
mov.u32 %r17, 0x0;
|
136 |
+
@%p6 ld.global.L1::evict_last.b32 { %r17 }, [ %rd35 + 0 ];
|
137 |
+
@!%p6 mov.u32 %r17, %r71;
|
138 |
+
mov.u32 %r19, 0x0;
|
139 |
+
@%p8 ld.global.L1::evict_last.b32 { %r19 }, [ %rd36 + 0 ];
|
140 |
+
@!%p8 mov.u32 %r19, %r71;
|
141 |
+
mov.u32 %r21, 0x0;
|
142 |
+
@%p10 ld.global.L1::evict_last.b32 { %r21 }, [ %rd37 + 0 ];
|
143 |
+
@!%p10 mov.u32 %r21, %r71;
|
144 |
+
mov.u32 %r23, 0x0;
|
145 |
+
@%p12 ld.global.L1::evict_last.b32 { %r23 }, [ %rd38 + 0 ];
|
146 |
+
@!%p12 mov.u32 %r23, %r71;
|
147 |
+
mov.u32 %r25, 0x0;
|
148 |
+
@%p14 ld.global.L1::evict_last.b32 { %r25 }, [ %rd39 + 0 ];
|
149 |
+
@!%p14 mov.u32 %r25, %r71;
|
150 |
+
mov.u32 %r27, 0x0;
|
151 |
+
@%p16 ld.global.L1::evict_last.b32 { %r27 }, [ %rd40 + 0 ];
|
152 |
+
@!%p16 mov.u32 %r27, %r71;
|
153 |
+
mov.u32 %r29, 0x0;
|
154 |
+
@%p18 ld.global.L1::evict_last.b32 { %r29 }, [ %rd41 + 0 ];
|
155 |
+
@!%p18 mov.u32 %r29, %r71;
|
156 |
+
mov.u32 %r31, 0x0;
|
157 |
+
@%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd42 + 0 ];
|
158 |
+
@!%p20 mov.u32 %r31, %r71;
|
159 |
+
mov.b32 %f36, %r31;
|
160 |
+
mov.b32 %f37, %r29;
|
161 |
+
mov.b32 %f38, %r27;
|
162 |
+
mov.b32 %f39, %r25;
|
163 |
+
mov.b32 %f40, %r23;
|
164 |
+
mov.b32 %f41, %r21;
|
165 |
+
mov.b32 %f42, %r19;
|
166 |
+
mov.b32 %f43, %r17;
|
167 |
+
.loc 1 42 23
|
168 |
+
mul.f32 %f44, %f2, %f43;
|
169 |
+
mul.f32 %f45, %f2, %f42;
|
170 |
+
mul.f32 %f46, %f2, %f41;
|
171 |
+
mul.f32 %f47, %f2, %f40;
|
172 |
+
mul.f32 %f48, %f2, %f39;
|
173 |
+
mul.f32 %f49, %f2, %f38;
|
174 |
+
mul.f32 %f50, %f2, %f37;
|
175 |
+
mul.f32 %f51, %f2, %f36;
|
176 |
+
.loc 1 45 40
|
177 |
+
selp.f32 %f52, %f51, 0f80000000, %p20;
|
178 |
+
selp.f32 %f53, %f50, 0f80000000, %p18;
|
179 |
+
selp.f32 %f54, %f49, 0f80000000, %p16;
|
180 |
+
selp.f32 %f55, %f48, 0f80000000, %p14;
|
181 |
+
selp.f32 %f56, %f47, 0f80000000, %p12;
|
182 |
+
selp.f32 %f57, %f46, 0f80000000, %p10;
|
183 |
+
selp.f32 %f58, %f45, 0f80000000, %p8;
|
184 |
+
selp.f32 %f59, %f44, 0f80000000, %p6;
|
185 |
+
add.f32 %f156, %f156, %f59;
|
186 |
+
add.f32 %f157, %f157, %f58;
|
187 |
+
add.f32 %f158, %f158, %f57;
|
188 |
+
add.f32 %f159, %f159, %f56;
|
189 |
+
add.f32 %f160, %f160, %f55;
|
190 |
+
add.f32 %f161, %f161, %f54;
|
191 |
+
add.f32 %f162, %f162, %f53;
|
192 |
+
add.f32 %f163, %f163, %f52;
|
193 |
+
.loc 1 32 36
|
194 |
+
add.s64 %rd124, %rd124, 2048;
|
195 |
+
cvt.u32.u64 %r33, %rd124;
|
196 |
+
add.s32 %r34, %r33, -2048;
|
197 |
+
setp.lt.u32 %p21, %r34, 48209;
|
198 |
+
@%p21 bra $L__BB0_1;
|
199 |
+
.loc 1 24 33
|
200 |
+
and.b32 %r41, %r1, 31;
|
201 |
+
and.b32 %r42, %r2, 7;
|
202 |
+
$L__tmp1:
|
203 |
+
.loc 2 233 15
|
204 |
+
add.f32 %f60, %f156, %f157;
|
205 |
+
add.f32 %f61, %f158, %f60;
|
206 |
+
add.f32 %f62, %f159, %f61;
|
207 |
+
add.f32 %f63, %f160, %f62;
|
208 |
+
add.f32 %f64, %f161, %f63;
|
209 |
+
add.f32 %f65, %f162, %f64;
|
210 |
+
add.f32 %f66, %f163, %f65;
|
211 |
+
$L__tmp2:
|
212 |
+
.loc 2 243 36
|
213 |
+
mov.b32 %r43, %f66;
|
214 |
+
shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1;
|
215 |
+
mov.b32 %f67, %r44;
|
216 |
+
$L__tmp3:
|
217 |
+
.loc 2 233 15
|
218 |
+
add.f32 %f68, %f66, %f67;
|
219 |
+
$L__tmp4:
|
220 |
+
.loc 2 243 36
|
221 |
+
mov.b32 %r45, %f68;
|
222 |
+
shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1;
|
223 |
+
mov.b32 %f69, %r46;
|
224 |
+
$L__tmp5:
|
225 |
+
.loc 2 233 15
|
226 |
+
add.f32 %f70, %f68, %f69;
|
227 |
+
$L__tmp6:
|
228 |
+
.loc 2 243 36
|
229 |
+
mov.b32 %r47, %f70;
|
230 |
+
shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1;
|
231 |
+
mov.b32 %f71, %r48;
|
232 |
+
$L__tmp7:
|
233 |
+
.loc 2 233 15
|
234 |
+
add.f32 %f72, %f70, %f71;
|
235 |
+
$L__tmp8:
|
236 |
+
.loc 2 243 36
|
237 |
+
mov.b32 %r49, %f72;
|
238 |
+
shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1;
|
239 |
+
mov.b32 %f73, %r50;
|
240 |
+
$L__tmp9:
|
241 |
+
.loc 2 233 15
|
242 |
+
add.f32 %f74, %f72, %f73;
|
243 |
+
$L__tmp10:
|
244 |
+
.loc 2 243 36
|
245 |
+
mov.b32 %r51, %f74;
|
246 |
+
shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
|
247 |
+
mov.b32 %f75, %r52;
|
248 |
+
$L__tmp11:
|
249 |
+
.loc 2 233 15
|
250 |
+
add.f32 %f76, %f74, %f75;
|
251 |
+
$L__tmp12:
|
252 |
+
.loc 2 243 36
|
253 |
+
setp.eq.s32 %p22, %r41, 0;
|
254 |
+
shl.b32 %r53, %r42, 2;
|
255 |
+
mov.u32 %r54, global_smem;
|
256 |
+
add.s32 %r35, %r54, %r53;
|
257 |
+
mov.b32 %r36, %f76;
|
258 |
+
@%p22 st.shared.b32 [ %r35 + 0 ], %r36;
|
259 |
+
bar.sync 0;
|
260 |
+
setp.lt.s32 %p23, %r1, 8;
|
261 |
+
shl.b32 %r55, %r1, 2;
|
262 |
+
add.s32 %r38, %r54, %r55;
|
263 |
+
@%p23 ld.shared.b32 %r37, [ %r38 + 0 ];
|
264 |
+
mov.b32 %f77, %r37;
|
265 |
+
shfl.sync.bfly.b32 %r56, %r37, 4, 31, -1;
|
266 |
+
mov.b32 %f78, %r56;
|
267 |
+
$L__tmp13:
|
268 |
+
.loc 2 233 15
|
269 |
+
add.f32 %f79, %f77, %f78;
|
270 |
+
$L__tmp14:
|
271 |
+
.loc 2 243 36
|
272 |
+
mov.b32 %r57, %f79;
|
273 |
+
shfl.sync.bfly.b32 %r58, %r57, 2, 31, -1;
|
274 |
+
mov.b32 %f80, %r58;
|
275 |
+
$L__tmp15:
|
276 |
+
.loc 2 233 15
|
277 |
+
add.f32 %f81, %f79, %f80;
|
278 |
+
$L__tmp16:
|
279 |
+
.loc 2 243 36
|
280 |
+
mov.b32 %r59, %f81;
|
281 |
+
shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1;
|
282 |
+
mov.b32 %f82, %r60;
|
283 |
+
$L__tmp17:
|
284 |
+
.loc 2 233 15
|
285 |
+
add.f32 %f83, %f81, %f82;
|
286 |
+
$L__tmp18:
|
287 |
+
.loc 2 243 36
|
288 |
+
and.b32 %r61, %r1, 7;
|
289 |
+
setp.eq.s32 %p25, %r61, 0;
|
290 |
+
and.pred %p24, %p23, %p25;
|
291 |
+
mov.b32 %r40, %f83;
|
292 |
+
@%p24 st.shared.b32 [ %r38 + 0 ], %r40;
|
293 |
+
bar.sync 0;
|
294 |
+
ld.shared.f32 %f26, [global_smem];
|
295 |
+
mov.u64 %rd125, 0;
|
296 |
+
mov.u16 %rs2, 0;
|
297 |
+
$L__tmp19:
|
298 |
+
$L__BB0_3:
|
299 |
+
.loc 1 52 27
|
300 |
+
or.b64 %rd92, %rd1, %rd125;
|
301 |
+
or.b64 %rd93, %rd2, %rd125;
|
302 |
+
or.b64 %rd94, %rd3, %rd125;
|
303 |
+
or.b64 %rd95, %rd4, %rd125;
|
304 |
+
or.b64 %rd96, %rd5, %rd125;
|
305 |
+
or.b64 %rd97, %rd6, %rd125;
|
306 |
+
or.b64 %rd98, %rd7, %rd125;
|
307 |
+
or.b64 %rd99, %rd8, %rd125;
|
308 |
+
.loc 1 53 25
|
309 |
+
setp.lt.u64 %p26, %rd92, 50257;
|
310 |
+
setp.lt.u64 %p28, %rd93, 50257;
|
311 |
+
setp.lt.u64 %p30, %rd94, 50257;
|
312 |
+
setp.lt.u64 %p32, %rd95, 50257;
|
313 |
+
setp.lt.u64 %p34, %rd96, 50257;
|
314 |
+
setp.lt.u64 %p36, %rd97, 50257;
|
315 |
+
setp.lt.u64 %p38, %rd98, 50257;
|
316 |
+
setp.lt.u64 %p40, %rd99, 50257;
|
317 |
+
.loc 1 55 41
|
318 |
+
add.s64 %rd100, %rd92, %rd9;
|
319 |
+
add.s64 %rd101, %rd93, %rd9;
|
320 |
+
add.s64 %rd102, %rd94, %rd9;
|
321 |
+
add.s64 %rd103, %rd95, %rd9;
|
322 |
+
add.s64 %rd104, %rd96, %rd9;
|
323 |
+
add.s64 %rd105, %rd97, %rd9;
|
324 |
+
add.s64 %rd106, %rd98, %rd9;
|
325 |
+
add.s64 %rd107, %rd99, %rd9;
|
326 |
+
.loc 1 55 35
|
327 |
+
shl.b64 %rd108, %rd100, 1;
|
328 |
+
add.s64 %rd60, %rd23, %rd108;
|
329 |
+
shl.b64 %rd109, %rd101, 1;
|
330 |
+
add.s64 %rd61, %rd23, %rd109;
|
331 |
+
shl.b64 %rd110, %rd102, 1;
|
332 |
+
add.s64 %rd62, %rd23, %rd110;
|
333 |
+
shl.b64 %rd111, %rd103, 1;
|
334 |
+
add.s64 %rd63, %rd23, %rd111;
|
335 |
+
shl.b64 %rd112, %rd104, 1;
|
336 |
+
add.s64 %rd64, %rd23, %rd112;
|
337 |
+
shl.b64 %rd113, %rd105, 1;
|
338 |
+
add.s64 %rd65, %rd23, %rd113;
|
339 |
+
shl.b64 %rd114, %rd106, 1;
|
340 |
+
add.s64 %rd66, %rd23, %rd114;
|
341 |
+
shl.b64 %rd115, %rd107, 1;
|
342 |
+
add.s64 %rd67, %rd23, %rd115;
|
343 |
+
.loc 1 55 53
|
344 |
+
mov.u16 %rs1, 0x0;
|
345 |
+
@%p26 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd60 + 0 ];
|
346 |
+
@!%p26 mov.u16 %rs1, %rs2;
|
347 |
+
mov.u16 %rs3, 0x0;
|
348 |
+
@%p28 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd61 + 0 ];
|
349 |
+
@!%p28 mov.u16 %rs3, %rs2;
|
350 |
+
mov.u16 %rs5, 0x0;
|
351 |
+
@%p30 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd62 + 0 ];
|
352 |
+
@!%p30 mov.u16 %rs5, %rs2;
|
353 |
+
mov.u16 %rs7, 0x0;
|
354 |
+
@%p32 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd63 + 0 ];
|
355 |
+
@!%p32 mov.u16 %rs7, %rs2;
|
356 |
+
mov.u16 %rs9, 0x0;
|
357 |
+
@%p34 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd64 + 0 ];
|
358 |
+
@!%p34 mov.u16 %rs9, %rs2;
|
359 |
+
mov.u16 %rs11, 0x0;
|
360 |
+
@%p36 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd65 + 0 ];
|
361 |
+
@!%p36 mov.u16 %rs11, %rs2;
|
362 |
+
mov.u16 %rs13, 0x0;
|
363 |
+
@%p38 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd66 + 0 ];
|
364 |
+
@!%p38 mov.u16 %rs13, %rs2;
|
365 |
+
mov.u16 %rs15, 0x0;
|
366 |
+
@%p40 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd67 + 0 ];
|
367 |
+
@!%p40 mov.u16 %rs15, %rs2;
|
368 |
+
.loc 1 55 105
|
369 |
+
cvt.f32.bf16 %r62, %rs1;
|
370 |
+
mov.b32 %f100, %r62;
|
371 |
+
cvt.f32.bf16 %r63, %rs3;
|
372 |
+
mov.b32 %f101, %r63;
|
373 |
+
cvt.f32.bf16 %r64, %rs5;
|
374 |
+
mov.b32 %f102, %r64;
|
375 |
+
cvt.f32.bf16 %r65, %rs7;
|
376 |
+
mov.b32 %f103, %r65;
|
377 |
+
cvt.f32.bf16 %r66, %rs9;
|
378 |
+
mov.b32 %f104, %r66;
|
379 |
+
cvt.f32.bf16 %r67, %rs11;
|
380 |
+
mov.b32 %f105, %r67;
|
381 |
+
cvt.f32.bf16 %r68, %rs13;
|
382 |
+
mov.b32 %f106, %r68;
|
383 |
+
cvt.f32.bf16 %r69, %rs15;
|
384 |
+
mov.b32 %f107, %r69;
|
385 |
+
.loc 1 56 35
|
386 |
+
shl.b64 %rd116, %rd92, 2;
|
387 |
+
add.s64 %rd68, %rd10, %rd116;
|
388 |
+
shl.b64 %rd117, %rd93, 2;
|
389 |
+
add.s64 %rd69, %rd10, %rd117;
|
390 |
+
shl.b64 %rd118, %rd94, 2;
|
391 |
+
add.s64 %rd70, %rd10, %rd118;
|
392 |
+
shl.b64 %rd119, %rd95, 2;
|
393 |
+
add.s64 %rd71, %rd10, %rd119;
|
394 |
+
shl.b64 %rd120, %rd96, 2;
|
395 |
+
add.s64 %rd72, %rd10, %rd120;
|
396 |
+
shl.b64 %rd121, %rd97, 2;
|
397 |
+
add.s64 %rd73, %rd10, %rd121;
|
398 |
+
shl.b64 %rd122, %rd98, 2;
|
399 |
+
add.s64 %rd74, %rd10, %rd122;
|
400 |
+
shl.b64 %rd123, %rd99, 2;
|
401 |
+
add.s64 %rd75, %rd10, %rd123;
|
402 |
+
.loc 1 56 53
|
403 |
+
mov.u32 %r70, 0x0;
|
404 |
+
@%p26 ld.global.L1::evict_first.b32 { %r70 }, [ %rd68 + 0 ];
|
405 |
+
@!%p26 mov.u32 %r70, %r71;
|
406 |
+
mov.b32 %f108, %r70;
|
407 |
+
mov.u32 %r72, 0x0;
|
408 |
+
@%p28 ld.global.L1::evict_first.b32 { %r72 }, [ %rd69 + 0 ];
|
409 |
+
@!%p28 mov.u32 %r72, %r71;
|
410 |
+
mov.b32 %f109, %r72;
|
411 |
+
mov.u32 %r74, 0x0;
|
412 |
+
@%p30 ld.global.L1::evict_first.b32 { %r74 }, [ %rd70 + 0 ];
|
413 |
+
@!%p30 mov.u32 %r74, %r71;
|
414 |
+
mov.b32 %f110, %r74;
|
415 |
+
mov.u32 %r76, 0x0;
|
416 |
+
@%p32 ld.global.L1::evict_first.b32 { %r76 }, [ %rd71 + 0 ];
|
417 |
+
@!%p32 mov.u32 %r76, %r71;
|
418 |
+
mov.b32 %f111, %r76;
|
419 |
+
mov.u32 %r78, 0x0;
|
420 |
+
@%p34 ld.global.L1::evict_first.b32 { %r78 }, [ %rd72 + 0 ];
|
421 |
+
@!%p34 mov.u32 %r78, %r71;
|
422 |
+
mov.b32 %f112, %r78;
|
423 |
+
mov.u32 %r80, 0x0;
|
424 |
+
@%p36 ld.global.L1::evict_first.b32 { %r80 }, [ %rd73 + 0 ];
|
425 |
+
@!%p36 mov.u32 %r80, %r71;
|
426 |
+
mov.b32 %f113, %r80;
|
427 |
+
mov.u32 %r82, 0x0;
|
428 |
+
@%p38 ld.global.L1::evict_first.b32 { %r82 }, [ %rd74 + 0 ];
|
429 |
+
@!%p38 mov.u32 %r82, %r71;
|
430 |
+
mov.b32 %f114, %r82;
|
431 |
+
mov.u32 %r84, 0x0;
|
432 |
+
@%p40 ld.global.L1::evict_first.b32 { %r84 }, [ %rd75 + 0 ];
|
433 |
+
@!%p40 mov.u32 %r84, %r71;
|
434 |
+
mov.b32 %f115, %r84;
|
435 |
+
.loc 1 57 35
|
436 |
+
add.s64 %rd76, %rd24, %rd108;
|
437 |
+
add.s64 %rd77, %rd24, %rd109;
|
438 |
+
add.s64 %rd78, %rd24, %rd110;
|
439 |
+
add.s64 %rd79, %rd24, %rd111;
|
440 |
+
add.s64 %rd80, %rd24, %rd112;
|
441 |
+
add.s64 %rd81, %rd24, %rd113;
|
442 |
+
add.s64 %rd82, %rd24, %rd114;
|
443 |
+
add.s64 %rd83, %rd24, %rd115;
|
444 |
+
.loc 1 57 53
|
445 |
+
mov.u16 %rs25, 0x0;
|
446 |
+
@%p26 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd76 + 0 ];
|
447 |
+
@!%p26 mov.u16 %rs25, %rs2;
|
448 |
+
mov.u16 %rs27, 0x0;
|
449 |
+
@%p28 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd77 + 0 ];
|
450 |
+
@!%p28 mov.u16 %rs27, %rs2;
|
451 |
+
mov.u16 %rs29, 0x0;
|
452 |
+
@%p30 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd78 + 0 ];
|
453 |
+
@!%p30 mov.u16 %rs29, %rs2;
|
454 |
+
mov.u16 %rs31, 0x0;
|
455 |
+
@%p32 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd79 + 0 ];
|
456 |
+
@!%p32 mov.u16 %rs31, %rs2;
|
457 |
+
mov.u16 %rs33, 0x0;
|
458 |
+
@%p34 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd80 + 0 ];
|
459 |
+
@!%p34 mov.u16 %rs33, %rs2;
|
460 |
+
mov.u16 %rs35, 0x0;
|
461 |
+
@%p36 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd81 + 0 ];
|
462 |
+
@!%p36 mov.u16 %rs35, %rs2;
|
463 |
+
mov.u16 %rs37, 0x0;
|
464 |
+
@%p38 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd82 + 0 ];
|
465 |
+
@!%p38 mov.u16 %rs37, %rs2;
|
466 |
+
mov.u16 %rs39, 0x0;
|
467 |
+
@%p40 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd83 + 0 ];
|
468 |
+
@!%p40 mov.u16 %rs39, %rs2;
|
469 |
+
.loc 1 57 105
|
470 |
+
cvt.f32.bf16 %r86, %rs25;
|
471 |
+
mov.b32 %f116, %r86;
|
472 |
+
cvt.f32.bf16 %r87, %rs27;
|
473 |
+
mov.b32 %f117, %r87;
|
474 |
+
cvt.f32.bf16 %r88, %rs29;
|
475 |
+
mov.b32 %f118, %r88;
|
476 |
+
cvt.f32.bf16 %r89, %rs31;
|
477 |
+
mov.b32 %f119, %r89;
|
478 |
+
cvt.f32.bf16 %r90, %rs33;
|
479 |
+
mov.b32 %f120, %r90;
|
480 |
+
cvt.f32.bf16 %r91, %rs35;
|
481 |
+
mov.b32 %f121, %r91;
|
482 |
+
cvt.f32.bf16 %r92, %rs37;
|
483 |
+
mov.b32 %f122, %r92;
|
484 |
+
cvt.f32.bf16 %r93, %rs39;
|
485 |
+
mov.b32 %f123, %r93;
|
486 |
+
.loc 1 65 23
|
487 |
+
mul.f32 %f85, %f116, 0f3FB8AA3B;
|
488 |
+
ex2.approx.f32 %f84, %f85;
|
489 |
+
mul.f32 %f87, %f117, 0f3FB8AA3B;
|
490 |
+
ex2.approx.f32 %f86, %f87;
|
491 |
+
mul.f32 %f89, %f118, 0f3FB8AA3B;
|
492 |
+
ex2.approx.f32 %f88, %f89;
|
493 |
+
mul.f32 %f91, %f119, 0f3FB8AA3B;
|
494 |
+
ex2.approx.f32 %f90, %f91;
|
495 |
+
mul.f32 %f93, %f120, 0f3FB8AA3B;
|
496 |
+
ex2.approx.f32 %f92, %f93;
|
497 |
+
mul.f32 %f95, %f121, 0f3FB8AA3B;
|
498 |
+
ex2.approx.f32 %f94, %f95;
|
499 |
+
mul.f32 %f97, %f122, 0f3FB8AA3B;
|
500 |
+
ex2.approx.f32 %f96, %f97;
|
501 |
+
mul.f32 %f99, %f123, 0f3FB8AA3B;
|
502 |
+
ex2.approx.f32 %f98, %f99;
|
503 |
+
.loc 1 66 24
|
504 |
+
mul.f32 %f124, %f26, %f84;
|
505 |
+
mul.f32 %f125, %f26, %f86;
|
506 |
+
mul.f32 %f126, %f26, %f88;
|
507 |
+
mul.f32 %f127, %f26, %f90;
|
508 |
+
mul.f32 %f128, %f26, %f92;
|
509 |
+
mul.f32 %f129, %f26, %f94;
|
510 |
+
mul.f32 %f130, %f26, %f96;
|
511 |
+
mul.f32 %f131, %f26, %f98;
|
512 |
+
.loc 1 67 24
|
513 |
+
neg.f32 %f132, %f124;
|
514 |
+
fma.rn.f32 %f133, %f2, %f108, %f132;
|
515 |
+
neg.f32 %f134, %f125;
|
516 |
+
fma.rn.f32 %f135, %f2, %f109, %f134;
|
517 |
+
neg.f32 %f136, %f126;
|
518 |
+
fma.rn.f32 %f137, %f2, %f110, %f136;
|
519 |
+
neg.f32 %f138, %f127;
|
520 |
+
fma.rn.f32 %f139, %f2, %f111, %f138;
|
521 |
+
neg.f32 %f140, %f128;
|
522 |
+
fma.rn.f32 %f141, %f2, %f112, %f140;
|
523 |
+
neg.f32 %f142, %f129;
|
524 |
+
fma.rn.f32 %f143, %f2, %f113, %f142;
|
525 |
+
neg.f32 %f144, %f130;
|
526 |
+
fma.rn.f32 %f145, %f2, %f114, %f144;
|
527 |
+
neg.f32 %f146, %f131;
|
528 |
+
fma.rn.f32 %f147, %f2, %f115, %f146;
|
529 |
+
.loc 1 69 24
|
530 |
+
add.f32 %f148, %f100, %f133;
|
531 |
+
add.f32 %f149, %f101, %f135;
|
532 |
+
add.f32 %f150, %f102, %f137;
|
533 |
+
add.f32 %f151, %f103, %f139;
|
534 |
+
add.f32 %f152, %f104, %f141;
|
535 |
+
add.f32 %f153, %f105, %f143;
|
536 |
+
add.f32 %f154, %f106, %f145;
|
537 |
+
add.f32 %f155, %f107, %f147;
|
538 |
+
.loc 1 70 29
|
539 |
+
add.s64 %rd84, %rd25, %rd108;
|
540 |
+
add.s64 %rd85, %rd25, %rd109;
|
541 |
+
add.s64 %rd86, %rd25, %rd110;
|
542 |
+
add.s64 %rd87, %rd25, %rd111;
|
543 |
+
add.s64 %rd88, %rd25, %rd112;
|
544 |
+
add.s64 %rd89, %rd25, %rd113;
|
545 |
+
add.s64 %rd90, %rd25, %rd114;
|
546 |
+
add.s64 %rd91, %rd25, %rd115;
|
547 |
+
.loc 1 70 54
|
548 |
+
mov.b32 %r94, %f148;
|
549 |
+
cvt.rn.bf16.f32 %rs49, %r94;
|
550 |
+
mov.b32 %r95, %f149;
|
551 |
+
cvt.rn.bf16.f32 %rs50, %r95;
|
552 |
+
mov.b32 %r96, %f150;
|
553 |
+
cvt.rn.bf16.f32 %rs51, %r96;
|
554 |
+
mov.b32 %r97, %f151;
|
555 |
+
cvt.rn.bf16.f32 %rs52, %r97;
|
556 |
+
mov.b32 %r98, %f152;
|
557 |
+
cvt.rn.bf16.f32 %rs53, %r98;
|
558 |
+
mov.b32 %r99, %f153;
|
559 |
+
cvt.rn.bf16.f32 %rs54, %r99;
|
560 |
+
mov.b32 %r100, %f154;
|
561 |
+
cvt.rn.bf16.f32 %rs55, %r100;
|
562 |
+
mov.b32 %r101, %f155;
|
563 |
+
cvt.rn.bf16.f32 %rs56, %r101;
|
564 |
+
@%p26 st.global.b16 [ %rd84 + 0 ], { %rs49 };
|
565 |
+
@%p28 st.global.b16 [ %rd85 + 0 ], { %rs50 };
|
566 |
+
@%p30 st.global.b16 [ %rd86 + 0 ], { %rs51 };
|
567 |
+
@%p32 st.global.b16 [ %rd87 + 0 ], { %rs52 };
|
568 |
+
@%p34 st.global.b16 [ %rd88 + 0 ], { %rs53 };
|
569 |
+
@%p36 st.global.b16 [ %rd89 + 0 ], { %rs54 };
|
570 |
+
@%p38 st.global.b16 [ %rd90 + 0 ], { %rs55 };
|
571 |
+
@%p40 st.global.b16 [ %rd91 + 0 ], { %rs56 };
|
572 |
+
.loc 1 51 36
|
573 |
+
add.s64 %rd125, %rd125, 2048;
|
574 |
+
cvt.u32.u64 %r102, %rd125;
|
575 |
+
add.s32 %r103, %r102, -2048;
|
576 |
+
setp.lt.u32 %p82, %r103, 48209;
|
577 |
+
@%p82 bra $L__BB0_3;
|
578 |
+
.loc 1 51 4
|
579 |
+
ret;
|
580 |
+
$L__tmp20:
|
581 |
+
$L__func_end0:
|
582 |
+
|
583 |
+
}
|
584 |
+
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
|
585 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
586 |
+
.section .debug_abbrev
|
587 |
+
{
|
588 |
+
.b8 1
|
589 |
+
.b8 17
|
590 |
+
.b8 1
|
591 |
+
.b8 37
|
592 |
+
.b8 8
|
593 |
+
.b8 19
|
594 |
+
.b8 5
|
595 |
+
.b8 3
|
596 |
+
.b8 8
|
597 |
+
.b8 16
|
598 |
+
.b8 6
|
599 |
+
.b8 27
|
600 |
+
.b8 8
|
601 |
+
.b8 180
|
602 |
+
.b8 66
|
603 |
+
.b8 12
|
604 |
+
.b8 17
|
605 |
+
.b8 1
|
606 |
+
.b8 18
|
607 |
+
.b8 1
|
608 |
+
.b8 0
|
609 |
+
.b8 0
|
610 |
+
.b8 2
|
611 |
+
.b8 46
|
612 |
+
.b8 0
|
613 |
+
.b8 135
|
614 |
+
.b8 64
|
615 |
+
.b8 8
|
616 |
+
.b8 3
|
617 |
+
.b8 8
|
618 |
+
.b8 58
|
619 |
+
.b8 11
|
620 |
+
.b8 59
|
621 |
+
.b8 11
|
622 |
+
.b8 63
|
623 |
+
.b8 12
|
624 |
+
.b8 32
|
625 |
+
.b8 11
|
626 |
+
.b8 0
|
627 |
+
.b8 0
|
628 |
+
.b8 3
|
629 |
+
.b8 46
|
630 |
+
.b8 1
|
631 |
+
.b8 17
|
632 |
+
.b8 1
|
633 |
+
.b8 18
|
634 |
+
.b8 1
|
635 |
+
.b8 64
|
636 |
+
.b8 10
|
637 |
+
.b8 49
|
638 |
+
.b8 19
|
639 |
+
.b8 0
|
640 |
+
.b8 0
|
641 |
+
.b8 4
|
642 |
+
.b8 29
|
643 |
+
.b8 1
|
644 |
+
.b8 49
|
645 |
+
.b8 19
|
646 |
+
.b8 17
|
647 |
+
.b8 1
|
648 |
+
.b8 18
|
649 |
+
.b8 1
|
650 |
+
.b8 88
|
651 |
+
.b8 11
|
652 |
+
.b8 89
|
653 |
+
.b8 11
|
654 |
+
.b8 87
|
655 |
+
.b8 11
|
656 |
+
.b8 0
|
657 |
+
.b8 0
|
658 |
+
.b8 5
|
659 |
+
.b8 29
|
660 |
+
.b8 0
|
661 |
+
.b8 49
|
662 |
+
.b8 19
|
663 |
+
.b8 17
|
664 |
+
.b8 1
|
665 |
+
.b8 18
|
666 |
+
.b8 1
|
667 |
+
.b8 88
|
668 |
+
.b8 11
|
669 |
+
.b8 89
|
670 |
+
.b8 11
|
671 |
+
.b8 87
|
672 |
+
.b8 11
|
673 |
+
.b8 0
|
674 |
+
.b8 0
|
675 |
+
.b8 0
|
676 |
+
}
|
677 |
+
.section .debug_info
|
678 |
+
{
|
679 |
+
.b32 278
|
680 |
+
.b8 2
|
681 |
+
.b8 0
|
682 |
+
.b32 .debug_abbrev
|
683 |
+
.b8 8
|
684 |
+
.b8 1
|
685 |
+
.b8 116
|
686 |
+
.b8 114
|
687 |
+
.b8 105
|
688 |
+
.b8 116
|
689 |
+
.b8 111
|
690 |
+
.b8 110
|
691 |
+
.b8 0
|
692 |
+
.b8 2
|
693 |
+
.b8 0
|
694 |
+
.b8 99
|
695 |
+
.b8 107
|
696 |
+
.b8 122
|
697 |
+
.b8 103
|
698 |
+
.b8 108
|
699 |
+
.b8 55
|
700 |
+
.b8 116
|
701 |
+
.b8 104
|
702 |
+
.b8 98
|
703 |
+
.b8 52
|
704 |
+
.b8 120
|
705 |
+
.b8 100
|
706 |
+
.b8 102
|
707 |
+
.b8 107
|
708 |
+
.b8 102
|
709 |
+
.b8 110
|
710 |
+
.b8 100
|
711 |
+
.b8 50
|
712 |
+
.b8 116
|
713 |
+
.b8 105
|
714 |
+
.b8 100
|
715 |
+
.b8 107
|
716 |
+
.b8 115
|
717 |
+
.b8 54
|
718 |
+
.b8 109
|
719 |
+
.b8 116
|
720 |
+
.b8 53
|
721 |
+
.b8 102
|
722 |
+
.b8 51
|
723 |
+
.b8 104
|
724 |
+
.b8 97
|
725 |
+
.b8 117
|
726 |
+
.b8 119
|
727 |
+
.b8 102
|
728 |
+
.b8 121
|
729 |
+
.b8 106
|
730 |
+
.b8 102
|
731 |
+
.b8 108
|
732 |
+
.b8 98
|
733 |
+
.b8 116
|
734 |
+
.b8 122
|
735 |
+
.b8 121
|
736 |
+
.b8 101
|
737 |
+
.b8 112
|
738 |
+
.b8 111
|
739 |
+
.b8 53
|
740 |
+
.b8 111
|
741 |
+
.b8 120
|
742 |
+
.b8 107
|
743 |
+
.b8 118
|
744 |
+
.b8 104
|
745 |
+
.b8 107
|
746 |
+
.b8 46
|
747 |
+
.b8 112
|
748 |
+
.b8 121
|
749 |
+
.b8 0
|
750 |
+
.b32 .debug_line
|
751 |
+
.b8 47
|
752 |
+
.b8 116
|
753 |
+
.b8 109
|
754 |
+
.b8 112
|
755 |
+
.b8 47
|
756 |
+
.b8 116
|
757 |
+
.b8 111
|
758 |
+
.b8 114
|
759 |
+
.b8 99
|
760 |
+
.b8 104
|
761 |
+
.b8 105
|
762 |
+
.b8 110
|
763 |
+
.b8 100
|
764 |
+
.b8 117
|
765 |
+
.b8 99
|
766 |
+
.b8 116
|
767 |
+
.b8 111
|
768 |
+
.b8 114
|
769 |
+
.b8 95
|
770 |
+
.b8 114
|
771 |
+
.b8 111
|
772 |
+
.b8 111
|
773 |
+
.b8 116
|
774 |
+
.b8 47
|
775 |
+
.b8 107
|
776 |
+
.b8 122
|
777 |
+
.b8 0
|
778 |
+
.b8 1
|
779 |
+
.b64 $L__func_begin0
|
780 |
+
.b64 $L__func_end0
|
781 |
+
.b8 2
|
782 |
+
.b8 116
|
783 |
+
.b8 114
|
784 |
+
.b8 105
|
785 |
+
.b8 116
|
786 |
+
.b8 111
|
787 |
+
.b8 110
|
788 |
+
.b8 95
|
789 |
+
.b8 95
|
790 |
+
.b8 48
|
791 |
+
.b8 100
|
792 |
+
.b8 49
|
793 |
+
.b8 100
|
794 |
+
.b8 50
|
795 |
+
.b8 100
|
796 |
+
.b8 51
|
797 |
+
.b8 100
|
798 |
+
.b8 52
|
799 |
+
.b8 100
|
800 |
+
.b8 53
|
801 |
+
.b8 100
|
802 |
+
.b8 54
|
803 |
+
.b8 100
|
804 |
+
.b8 55
|
805 |
+
.b8 100
|
806 |
+
.b8 101
|
807 |
+
.b8 56
|
808 |
+
.b8 0
|
809 |
+
.b8 116
|
810 |
+
.b8 114
|
811 |
+
.b8 105
|
812 |
+
.b8 116
|
813 |
+
.b8 111
|
814 |
+
.b8 110
|
815 |
+
.b8 95
|
816 |
+
.b8 95
|
817 |
+
.b8 48
|
818 |
+
.b8 100
|
819 |
+
.b8 49
|
820 |
+
.b8 100
|
821 |
+
.b8 50
|
822 |
+
.b8 100
|
823 |
+
.b8 51
|
824 |
+
.b8 100
|
825 |
+
.b8 52
|
826 |
+
.b8 100
|
827 |
+
.b8 53
|
828 |
+
.b8 100
|
829 |
+
.b8 54
|
830 |
+
.b8 100
|
831 |
+
.b8 55
|
832 |
+
.b8 100
|
833 |
+
.b8 101
|
834 |
+
.b8 56
|
835 |
+
.b8 0
|
836 |
+
.b8 1
|
837 |
+
.b8 18
|
838 |
+
.b8 1
|
839 |
+
.b8 1
|
840 |
+
.b8 3
|
841 |
+
.b64 $L__func_begin0
|
842 |
+
.b64 $L__func_end0
|
843 |
+
.b8 1
|
844 |
+
.b8 156
|
845 |
+
.b32 125
|
846 |
+
.b8 4
|
847 |
+
.b32 125
|
848 |
+
.b64 $L__tmp1
|
849 |
+
.b64 $L__tmp18
|
850 |
+
.b8 2
|
851 |
+
.b8 46
|
852 |
+
.b8 27
|
853 |
+
.b8 5
|
854 |
+
.b32 125
|
855 |
+
.b64 $L__tmp1
|
856 |
+
.b64 $L__tmp18
|
857 |
+
.b8 2
|
858 |
+
.b8 243
|
859 |
+
.b8 36
|
860 |
+
.b8 0
|
861 |
+
.b8 5
|
862 |
+
.b32 125
|
863 |
+
.b64 $L__tmp2
|
864 |
+
.b64 $L__tmp19
|
865 |
+
.b8 2
|
866 |
+
.b8 46
|
867 |
+
.b8 27
|
868 |
+
.b8 0
|
869 |
+
.b8 0
|
870 |
+
}
|
871 |
+
.section .debug_pubnames
|
872 |
+
{
|
873 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
874 |
+
$L__pubNames_start0:
|
875 |
+
.b8 2
|
876 |
+
.b8 0
|
877 |
+
.b32 .debug_info
|
878 |
+
.b32 282
|
879 |
+
.b32 125
|
880 |
+
.b8 116
|
881 |
+
.b8 114
|
882 |
+
.b8 105
|
883 |
+
.b8 116
|
884 |
+
.b8 111
|
885 |
+
.b8 110
|
886 |
+
.b8 95
|
887 |
+
.b8 95
|
888 |
+
.b8 48
|
889 |
+
.b8 100
|
890 |
+
.b8 49
|
891 |
+
.b8 100
|
892 |
+
.b8 50
|
893 |
+
.b8 100
|
894 |
+
.b8 51
|
895 |
+
.b8 100
|
896 |
+
.b8 52
|
897 |
+
.b8 100
|
898 |
+
.b8 53
|
899 |
+
.b8 100
|
900 |
+
.b8 54
|
901 |
+
.b8 100
|
902 |
+
.b8 55
|
903 |
+
.b8 100
|
904 |
+
.b8 101
|
905 |
+
.b8 56
|
906 |
+
.b8 0
|
907 |
+
.b32 0
|
908 |
+
$L__pubNames_end0:
|
909 |
+
}
|
910 |
+
.section .debug_pubtypes
|
911 |
+
{
|
912 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
913 |
+
$L__pubTypes_start0:
|
914 |
+
.b8 2
|
915 |
+
.b8 0
|
916 |
+
.b32 .debug_info
|
917 |
+
.b32 282
|
918 |
+
.b32 0
|
919 |
+
$L__pubTypes_end0:
|
920 |
+
}
|
921 |
+
.section .debug_loc { }
|
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<-1> : tensor<1x1xi64, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%c2048_i32 = arith.constant 2048 : i32
|
10 |
+
%c50257_i32 = arith.constant 50257 : i32
|
11 |
+
%c50257_i64 = arith.constant 50257 : i64
|
12 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked>
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = arith.extsi %0 : i32 to i64
|
15 |
+
%2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
16 |
+
%3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
|
17 |
+
%4 = arith.extsi %3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
|
18 |
+
%5 = tt.addptr %arg1, %1 : !tt.ptr<i64, 1>, i64
|
19 |
+
%6 = tt.splat %5 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
|
20 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked>
|
21 |
+
%8 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
22 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
23 |
+
%10 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
24 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
25 |
+
%12 = arith.muli %1, %c50257_i64 : i64
|
26 |
+
%13 = tt.splat %12 : (i64) -> tensor<1x2048xi64, #blocked>
|
27 |
+
%14 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
|
28 |
+
%15 = arith.cmpi ne, %7, %cst_0 : tensor<1x1xi64, #blocked>
|
29 |
+
%16 = arith.divf %9, %11 : f32
|
30 |
+
%17 = tt.splat %16 : (f32) -> tensor<1x1xf32, #blocked>
|
31 |
+
%18 = arith.select %15, %17, %cst : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked>
|
32 |
+
%19 = tt.broadcast %18 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
|
33 |
+
%20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg10 = %cst_1) -> (tensor<1x2048xf32, #blocked>) : i32 {
|
34 |
+
%27 = arith.extsi %arg9 : i32 to i64
|
35 |
+
%28 = tt.splat %27 : (i64) -> tensor<1x2048xi64, #blocked>
|
36 |
+
%29 = arith.addi %28, %4 : tensor<1x2048xi64, #blocked>
|
37 |
+
%30 = arith.cmpi slt, %29, %cst_2 : tensor<1x2048xi64, #blocked>
|
38 |
+
%31 = arith.addi %29, %13 : tensor<1x2048xi64, #blocked>
|
39 |
+
%32 = tt.addptr %14, %31 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
40 |
+
%33 = tt.load %32, %30, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
|
41 |
+
%34 = arith.mulf %33, %19 : tensor<1x2048xf32, #blocked>
|
42 |
+
%35 = arith.addf %arg10, %34 : tensor<1x2048xf32, #blocked>
|
43 |
+
%36 = arith.select %30, %35, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
|
44 |
+
scf.yield %36 : tensor<1x2048xf32, #blocked>
|
45 |
+
}
|
46 |
+
%21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({
|
47 |
+
^bb0(%arg9: f32, %arg10: f32):
|
48 |
+
%27 = arith.addf %arg9, %arg10 : f32
|
49 |
+
tt.reduce.return %27 : f32
|
50 |
+
}) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
51 |
+
%22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
|
52 |
+
%23 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
|
53 |
+
%24 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
|
54 |
+
%25 = tt.broadcast %22 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
|
55 |
+
%26 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
|
56 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 : i32 {
|
57 |
+
%27 = arith.extsi %arg9 : i32 to i64
|
58 |
+
%28 = tt.splat %27 : (i64) -> tensor<1x2048xi64, #blocked>
|
59 |
+
%29 = arith.addi %28, %4 : tensor<1x2048xi64, #blocked>
|
60 |
+
%30 = arith.cmpi slt, %29, %cst_2 : tensor<1x2048xi64, #blocked>
|
61 |
+
%31 = arith.addi %29, %13 : tensor<1x2048xi64, #blocked>
|
62 |
+
%32 = tt.addptr %23, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
63 |
+
%33 = tt.load %32, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
|
64 |
+
%34 = arith.extf %33 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
|
65 |
+
%35 = tt.addptr %14, %31 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
66 |
+
%36 = tt.load %35, %30, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
|
67 |
+
%37 = tt.addptr %24, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
68 |
+
%38 = tt.load %37, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
|
69 |
+
%39 = arith.extf %38 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
|
70 |
+
%40 = arith.mulf %36, %19 : tensor<1x2048xf32, #blocked>
|
71 |
+
%41 = math.exp %39 : tensor<1x2048xf32, #blocked>
|
72 |
+
%42 = arith.mulf %41, %25 : tensor<1x2048xf32, #blocked>
|
73 |
+
%43 = arith.subf %40, %42 : tensor<1x2048xf32, #blocked>
|
74 |
+
%44 = arith.addf %34, %43 : tensor<1x2048xf32, #blocked>
|
75 |
+
%45 = tt.addptr %26, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
76 |
+
%46 = arith.truncf %44 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked>
|
77 |
+
tt.store %45, %46, %30 {cache = 1 : i32, evict = 1 : i32} : tensor<1x2048xbf16, #blocked>
|
78 |
+
}
|
79 |
+
tt.return
|
80 |
+
}
|
81 |
+
}
|
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>
|
4 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32>
|
5 |
+
%c50257_i64 = arith.constant 50257 : i64
|
6 |
+
%c50257_i32 = arith.constant 50257 : i32
|
7 |
+
%c2048_i32 = arith.constant 2048 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_1 = arith.constant dense<50257> : tensor<1x2048xi64>
|
10 |
+
%cst_2 = arith.constant dense<-1> : tensor<1x1xi64>
|
11 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.extsi %0 : i32 to i64
|
14 |
+
%2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
|
16 |
+
%4 = arith.extsi %3 : tensor<1x2048xi32> to tensor<1x2048xi64>
|
17 |
+
%5 = tt.addptr %arg1, %1 : !tt.ptr<i64, 1>, i64
|
18 |
+
%6 = tt.splat %5 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
|
19 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64>
|
20 |
+
%8 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
21 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
22 |
+
%10 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
23 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
24 |
+
%12 = arith.muli %1, %c50257_i64 : i64
|
25 |
+
%13 = tt.splat %12 : (i64) -> tensor<1x2048xi64>
|
26 |
+
%14 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
|
27 |
+
%15 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64>
|
28 |
+
%16 = arith.divf %9, %11 : f32
|
29 |
+
%17 = tt.splat %16 : (f32) -> tensor<1x1xf32>
|
30 |
+
%18 = arith.select %15, %17, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32>
|
31 |
+
%19 = tt.broadcast %18 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
|
32 |
+
%20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg10 = %cst_3) -> (tensor<1x2048xf32>) : i32 {
|
33 |
+
%35 = arith.extsi %arg9 : i32 to i64
|
34 |
+
%36 = tt.splat %35 : (i64) -> tensor<1x2048xi64>
|
35 |
+
%37 = arith.addi %36, %4 : tensor<1x2048xi64>
|
36 |
+
%38 = arith.cmpi slt, %37, %cst_1 : tensor<1x2048xi64>
|
37 |
+
%39 = arith.addi %37, %13 : tensor<1x2048xi64>
|
38 |
+
%40 = tt.addptr %14, %39 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
|
39 |
+
%41 = tt.load %40, %38, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xf32>
|
40 |
+
%42 = arith.mulf %41, %19 : tensor<1x2048xf32>
|
41 |
+
%43 = arith.addf %arg10, %42 : tensor<1x2048xf32>
|
42 |
+
%44 = arith.select %38, %43, %arg10 : tensor<1x2048xi1>, tensor<1x2048xf32>
|
43 |
+
scf.yield %44 : tensor<1x2048xf32>
|
44 |
+
}
|
45 |
+
%21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({
|
46 |
+
^bb0(%arg9: f32, %arg10: f32):
|
47 |
+
%35 = arith.addf %arg9, %arg10 : f32
|
48 |
+
tt.reduce.return %35 : f32
|
49 |
+
}) : (tensor<1x2048xf32>) -> tensor<1xf32>
|
50 |
+
%22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
|
51 |
+
%23 = arith.muli %1, %c50257_i64 : i64
|
52 |
+
%24 = tt.splat %23 : (i64) -> tensor<1x2048xi64>
|
53 |
+
%25 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
|
54 |
+
%26 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
|
55 |
+
%27 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
|
56 |
+
%28 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64>
|
57 |
+
%29 = arith.divf %9, %11 : f32
|
58 |
+
%30 = tt.splat %29 : (f32) -> tensor<1x1xf32>
|
59 |
+
%31 = arith.select %28, %30, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32>
|
60 |
+
%32 = tt.broadcast %31 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
|
61 |
+
%33 = tt.broadcast %22 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
|
62 |
+
%34 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
|
63 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 : i32 {
|
64 |
+
%35 = arith.extsi %arg9 : i32 to i64
|
65 |
+
%36 = tt.splat %35 : (i64) -> tensor<1x2048xi64>
|
66 |
+
%37 = arith.addi %36, %4 : tensor<1x2048xi64>
|
67 |
+
%38 = arith.cmpi slt, %37, %cst_1 : tensor<1x2048xi64>
|
68 |
+
%39 = arith.addi %37, %24 : tensor<1x2048xi64>
|
69 |
+
%40 = tt.addptr %25, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
|
70 |
+
%41 = tt.load %40, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
|
71 |
+
%42 = arith.extf %41 : tensor<1x2048xbf16> to tensor<1x2048xf32>
|
72 |
+
%43 = tt.addptr %26, %39 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
|
73 |
+
%44 = tt.load %43, %38, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32>
|
74 |
+
%45 = tt.addptr %27, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
|
75 |
+
%46 = tt.load %45, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
|
76 |
+
%47 = arith.extf %46 : tensor<1x2048xbf16> to tensor<1x2048xf32>
|
77 |
+
%48 = arith.mulf %44, %32 : tensor<1x2048xf32>
|
78 |
+
%49 = math.exp %47 : tensor<1x2048xf32>
|
79 |
+
%50 = arith.mulf %49, %33 : tensor<1x2048xf32>
|
80 |
+
%51 = arith.subf %48, %50 : tensor<1x2048xf32>
|
81 |
+
%52 = arith.addf %42, %51 : tensor<1x2048xf32>
|
82 |
+
%53 = tt.addptr %34, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
|
83 |
+
%54 = arith.truncf %52 : tensor<1x2048xf32> to tensor<1x2048xbf16>
|
84 |
+
tt.store %53, %54, %38 {cache = 1 : i32, evict = 1 : i32} : tensor<1x2048xbf16>
|
85 |
+
}
|
86 |
+
tt.return
|
87 |
+
}
|
88 |
+
}
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin
ADDED
Binary file (5.16 kB). View file
|
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 1, !dbg !8
|
7 |
+
%6 = and i32 %5, 510, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 9, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = icmp slt i32 %9, 12865792, !dbg !12
|
12 |
+
%11 = sext i32 %9 to i64, !dbg !13
|
13 |
+
%12 = getelementptr i16, ptr addrspace(1) %0, i64 %11, !dbg !13
|
14 |
+
%13 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %12, i1 %10) #1, !dbg !14
|
15 |
+
%14 = trunc i32 %13 to i16, !dbg !14
|
16 |
+
%extelt.offset = lshr i32 %13, 16, !dbg !14
|
17 |
+
%15 = trunc i32 %extelt.offset to i16, !dbg !14
|
18 |
+
%16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !15
|
19 |
+
%17 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %15) #1, !dbg !15
|
20 |
+
%18 = getelementptr float, ptr addrspace(1) %1, i64 %11, !dbg !16
|
21 |
+
%19 = bitcast float %16 to i32, !dbg !17
|
22 |
+
%20 = bitcast float %17 to i32, !dbg !17
|
23 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %19, i32 %20, ptr addrspace(1) %18, i1 %10) #1, !dbg !17
|
24 |
+
ret void, !dbg !18
|
25 |
+
}
|
26 |
+
|
27 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
28 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
29 |
+
|
30 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
31 |
+
attributes #1 = { nounwind }
|
32 |
+
|
33 |
+
!llvm.module.flags = !{!0}
|
34 |
+
!llvm.dbg.cu = !{!1}
|
35 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
36 |
+
|
37 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
38 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
39 |
+
!2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
|
40 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
41 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
|
42 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
43 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
44 |
+
!7 = !{}
|
45 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
46 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
47 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
48 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
49 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
50 |
+
!13 = !DILocation(line: 24, column: 30, scope: !5)
|
51 |
+
!14 = !DILocation(line: 24, column: 35, scope: !5)
|
52 |
+
!15 = !DILocation(line: 24, column: 45, scope: !5)
|
53 |
+
!16 = !DILocation(line: 26, column: 25, scope: !5)
|
54 |
+
!17 = !DILocation(line: 26, column: 36, scope: !5)
|
55 |
+
!18 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 256, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<3>;
|
19 |
+
.reg .b16 %rs<3>;
|
20 |
+
.reg .b32 %r<12>;
|
21 |
+
.reg .b64 %rd<7>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd3, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r7, %tid.x;
|
31 |
+
shl.b32 %r8, %r7, 1;
|
32 |
+
and.b32 %r9, %r8, 510;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r10, %r1, 9;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r11, %r10, %r9;
|
39 |
+
.loc 1 22 21
|
40 |
+
setp.lt.s32 %p1, %r11, 12865792;
|
41 |
+
.loc 1 24 30
|
42 |
+
mul.wide.s32 %rd5, %r11, 2;
|
43 |
+
add.s64 %rd1, %rd3, %rd5;
|
44 |
+
.loc 1 24 35
|
45 |
+
mov.u32 %r2, 0x0;
|
46 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
|
47 |
+
cvt.u16.u32 %rs1, %r2;
|
48 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
49 |
+
.loc 1 24 45
|
50 |
+
cvt.f32.bf16 %r5, %rs1;
|
51 |
+
cvt.f32.bf16 %r6, %rs2;
|
52 |
+
.loc 1 26 25
|
53 |
+
mul.wide.s32 %rd6, %r11, 4;
|
54 |
+
add.s64 %rd2, %rd4, %rd6;
|
55 |
+
.loc 1 26 36
|
56 |
+
@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 };
|
57 |
+
.loc 1 26 4
|
58 |
+
ret;
|
59 |
+
$L__tmp1:
|
60 |
+
$L__func_end0:
|
61 |
+
|
62 |
+
}
|
63 |
+
.file 1 "/tmp/torchinductor_root/mx/cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py"
|
64 |
+
.section .debug_abbrev
|
65 |
+
{
|
66 |
+
.b8 1
|
67 |
+
.b8 17
|
68 |
+
.b8 1
|
69 |
+
.b8 37
|
70 |
+
.b8 8
|
71 |
+
.b8 19
|
72 |
+
.b8 5
|
73 |
+
.b8 3
|
74 |
+
.b8 8
|
75 |
+
.b8 16
|
76 |
+
.b8 6
|
77 |
+
.b8 27
|
78 |
+
.b8 8
|
79 |
+
.b8 180
|
80 |
+
.b8 66
|
81 |
+
.b8 12
|
82 |
+
.b8 17
|
83 |
+
.b8 1
|
84 |
+
.b8 18
|
85 |
+
.b8 1
|
86 |
+
.b8 0
|
87 |
+
.b8 0
|
88 |
+
.b8 2
|
89 |
+
.b8 46
|
90 |
+
.b8 0
|
91 |
+
.b8 17
|
92 |
+
.b8 1
|
93 |
+
.b8 18
|
94 |
+
.b8 1
|
95 |
+
.b8 64
|
96 |
+
.b8 10
|
97 |
+
.b8 135
|
98 |
+
.b8 64
|
99 |
+
.b8 8
|
100 |
+
.b8 3
|
101 |
+
.b8 8
|
102 |
+
.b8 58
|
103 |
+
.b8 11
|
104 |
+
.b8 59
|
105 |
+
.b8 11
|
106 |
+
.b8 63
|
107 |
+
.b8 12
|
108 |
+
.b8 0
|
109 |
+
.b8 0
|
110 |
+
.b8 0
|
111 |
+
}
|
112 |
+
.section .debug_info
|
113 |
+
{
|
114 |
+
.b32 176
|
115 |
+
.b8 2
|
116 |
+
.b8 0
|
117 |
+
.b32 .debug_abbrev
|
118 |
+
.b8 8
|
119 |
+
.b8 1
|
120 |
+
.b8 116
|
121 |
+
.b8 114
|
122 |
+
.b8 105
|
123 |
+
.b8 116
|
124 |
+
.b8 111
|
125 |
+
.b8 110
|
126 |
+
.b8 0
|
127 |
+
.b8 2
|
128 |
+
.b8 0
|
129 |
+
.b8 99
|
130 |
+
.b8 109
|
131 |
+
.b8 120
|
132 |
+
.b8 109
|
133 |
+
.b8 50
|
134 |
+
.b8 111
|
135 |
+
.b8 98
|
136 |
+
.b8 117
|
137 |
+
.b8 99
|
138 |
+
.b8 113
|
139 |
+
.b8 102
|
140 |
+
.b8 102
|
141 |
+
.b8 50
|
142 |
+
.b8 122
|
143 |
+
.b8 52
|
144 |
+
.b8 118
|
145 |
+
.b8 99
|
146 |
+
.b8 53
|
147 |
+
.b8 53
|
148 |
+
.b8 122
|
149 |
+
.b8 99
|
150 |
+
.b8 110
|
151 |
+
.b8 115
|
152 |
+
.b8 99
|
153 |
+
.b8 102
|
154 |
+
.b8 117
|
155 |
+
.b8 118
|
156 |
+
.b8 117
|
157 |
+
.b8 114
|
158 |
+
.b8 53
|
159 |
+
.b8 115
|
160 |
+
.b8 50
|
161 |
+
.b8 98
|
162 |
+
.b8 51
|
163 |
+
.b8 101
|
164 |
+
.b8 51
|
165 |
+
.b8 54
|
166 |
+
.b8 100
|
167 |
+
.b8 118
|
168 |
+
.b8 103
|
169 |
+
.b8 109
|
170 |
+
.b8 53
|
171 |
+
.b8 55
|
172 |
+
.b8 113
|
173 |
+
.b8 111
|
174 |
+
.b8 98
|
175 |
+
.b8 97
|
176 |
+
.b8 110
|
177 |
+
.b8 108
|
178 |
+
.b8 112
|
179 |
+
.b8 104
|
180 |
+
.b8 111
|
181 |
+
.b8 46
|
182 |
+
.b8 112
|
183 |
+
.b8 121
|
184 |
+
.b8 0
|
185 |
+
.b32 .debug_line
|
186 |
+
.b8 47
|
187 |
+
.b8 116
|
188 |
+
.b8 109
|
189 |
+
.b8 112
|
190 |
+
.b8 47
|
191 |
+
.b8 116
|
192 |
+
.b8 111
|
193 |
+
.b8 114
|
194 |
+
.b8 99
|
195 |
+
.b8 104
|
196 |
+
.b8 105
|
197 |
+
.b8 110
|
198 |
+
.b8 100
|
199 |
+
.b8 117
|
200 |
+
.b8 99
|
201 |
+
.b8 116
|
202 |
+
.b8 111
|
203 |
+
.b8 114
|
204 |
+
.b8 95
|
205 |
+
.b8 114
|
206 |
+
.b8 111
|
207 |
+
.b8 111
|
208 |
+
.b8 116
|
209 |
+
.b8 47
|
210 |
+
.b8 109
|
211 |
+
.b8 120
|
212 |
+
.b8 0
|
213 |
+
.b8 1
|
214 |
+
.b64 $L__func_begin0
|
215 |
+
.b64 $L__func_end0
|
216 |
+
.b8 2
|
217 |
+
.b64 $L__func_begin0
|
218 |
+
.b64 $L__func_end0
|
219 |
+
.b8 1
|
220 |
+
.b8 156
|
221 |
+
.b8 116
|
222 |
+
.b8 114
|
223 |
+
.b8 105
|
224 |
+
.b8 116
|
225 |
+
.b8 111
|
226 |
+
.b8 110
|
227 |
+
.b8 95
|
228 |
+
.b8 95
|
229 |
+
.b8 48
|
230 |
+
.b8 100
|
231 |
+
.b8 49
|
232 |
+
.b8 100
|
233 |
+
.b8 50
|
234 |
+
.b8 100
|
235 |
+
.b8 101
|
236 |
+
.b8 0
|
237 |
+
.b8 116
|
238 |
+
.b8 114
|
239 |
+
.b8 105
|
240 |
+
.b8 116
|
241 |
+
.b8 111
|
242 |
+
.b8 110
|
243 |
+
.b8 95
|
244 |
+
.b8 95
|
245 |
+
.b8 48
|
246 |
+
.b8 100
|
247 |
+
.b8 49
|
248 |
+
.b8 100
|
249 |
+
.b8 50
|
250 |
+
.b8 100
|
251 |
+
.b8 101
|
252 |
+
.b8 0
|
253 |
+
.b8 1
|
254 |
+
.b8 18
|
255 |
+
.b8 1
|
256 |
+
.b8 0
|
257 |
+
}
|
258 |
+
.section .debug_pubnames
|
259 |
+
{
|
260 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
261 |
+
$L__pubNames_start0:
|
262 |
+
.b8 2
|
263 |
+
.b8 0
|
264 |
+
.b32 .debug_info
|
265 |
+
.b32 180
|
266 |
+
.b32 125
|
267 |
+
.b8 116
|
268 |
+
.b8 114
|
269 |
+
.b8 105
|
270 |
+
.b8 116
|
271 |
+
.b8 111
|
272 |
+
.b8 110
|
273 |
+
.b8 95
|
274 |
+
.b8 95
|
275 |
+
.b8 48
|
276 |
+
.b8 100
|
277 |
+
.b8 49
|
278 |
+
.b8 100
|
279 |
+
.b8 50
|
280 |
+
.b8 100
|
281 |
+
.b8 101
|
282 |
+
.b8 0
|
283 |
+
.b32 0
|
284 |
+
$L__pubNames_end0:
|
285 |
+
}
|
286 |
+
.section .debug_pubtypes
|
287 |
+
{
|
288 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
289 |
+
$L__pubTypes_start0:
|
290 |
+
.b8 2
|
291 |
+
.b8 0
|
292 |
+
.b32 .debug_info
|
293 |
+
.b32 180
|
294 |
+
.b32 0
|
295 |
+
$L__pubTypes_end0:
|
296 |
+
}
|
297 |
+
.section .debug_loc { }
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
|
5 |
+
%c512_i32 = arith.constant 512 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
14 |
+
%8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
|
15 |
+
%9 = arith.extf %8 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
|
16 |
+
%10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
17 |
+
%11 = tt.addptr %10, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
|
18 |
+
tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
|
19 |
+
tt.return
|
20 |
+
}
|
21 |
+
}
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<12865792> : tensor<512xi32>
|
4 |
+
%c512_i32 = arith.constant 512 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
10 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<512xi32>
|
11 |
+
%6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
|
12 |
+
%7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
|
13 |
+
%8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
|
14 |
+
%9 = arith.extf %8 : tensor<512xbf16> to tensor<512xf32>
|
15 |
+
%10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
16 |
+
%11 = tt.addptr %10, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
|
17 |
+
tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
|
18 |
+
tt.return
|
19 |
+
}
|
20 |
+
}
|
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin
ADDED
Binary file (36.4 kB). View file
|
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin
ADDED
Binary file (4.65 kB). View file
|
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i64 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 2, !dbg !8
|
7 |
+
%5 = and i32 %4, 508, !dbg !8
|
8 |
+
%6 = or i32 %5, 512, !dbg !8
|
9 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
10 |
+
%8 = sext i32 %7 to i64, !dbg !10
|
11 |
+
%9 = shl nsw i64 %8, 10, !dbg !11
|
12 |
+
%10 = zext nneg i32 %5 to i64
|
13 |
+
%11 = zext nneg i32 %6 to i64
|
14 |
+
%12 = or i64 %9, %10, !dbg !12
|
15 |
+
%13 = or i64 %9, %11, !dbg !12
|
16 |
+
%14 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
|
17 |
+
%15 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !13
|
18 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %14, i1 true) #1, !dbg !14
|
19 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 true) #1, !dbg !14
|
20 |
+
ret void, !dbg !15
|
21 |
+
}
|
22 |
+
|
23 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
24 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
25 |
+
|
26 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
27 |
+
attributes #1 = { nounwind }
|
28 |
+
|
29 |
+
!llvm.module.flags = !{!0}
|
30 |
+
!llvm.dbg.cu = !{!1}
|
31 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
32 |
+
|
33 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
34 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
35 |
+
!2 = !DIFile(filename: "cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py", directory: "/tmp/torchinductor_root/pk")
|
36 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
37 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
38 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
39 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
40 |
+
!7 = !{}
|
41 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
42 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
43 |
+
!10 = !DILocation(line: 20, column: 34, scope: !5)
|
44 |
+
!11 = !DILocation(line: 20, column: 46, scope: !5)
|
45 |
+
!12 = !DILocation(line: 21, column: 23, scope: !5)
|
46 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
47 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
48 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1de(
|
12 |
+
.param .u64 triton__0d1de_param_0,
|
13 |
+
.param .u64 triton__0d1de_param_1
|
14 |
+
)
|
15 |
+
.maxntid 128, 1, 1
|
16 |
+
{
|
17 |
+
.reg .pred %p<3>;
|
18 |
+
.reg .b32 %r<13>;
|
19 |
+
.reg .b64 %rd<8>;
|
20 |
+
.loc 1 18 0
|
21 |
+
$L__func_begin0:
|
22 |
+
.loc 1 18 0
|
23 |
+
|
24 |
+
ld.param.u64 %rd3, [triton__0d1de_param_0];
|
25 |
+
$L__tmp0:
|
26 |
+
.loc 1 21 36
|
27 |
+
mov.u32 %r10, %tid.x;
|
28 |
+
shl.b32 %r11, %r10, 2;
|
29 |
+
and.b32 %r12, %r11, 508;
|
30 |
+
.loc 1 20 28
|
31 |
+
mov.u32 %r1, %ctaid.x;
|
32 |
+
.loc 1 20 46
|
33 |
+
mul.wide.s32 %rd4, %r1, 1024;
|
34 |
+
cvt.u64.u32 %rd5, %r12;
|
35 |
+
.loc 1 21 23
|
36 |
+
or.b64 %rd6, %rd4, %rd5;
|
37 |
+
.loc 1 25 25
|
38 |
+
shl.b64 %rd7, %rd6, 2;
|
39 |
+
add.s64 %rd1, %rd3, %rd7;
|
40 |
+
add.s64 %rd2, %rd1, 2048;
|
41 |
+
mov.b32 %r2, 0;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 25 36
|
44 |
+
@%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
|
45 |
+
@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
|
46 |
+
.loc 1 25 4
|
47 |
+
ret;
|
48 |
+
$L__tmp1:
|
49 |
+
$L__func_end0:
|
50 |
+
|
51 |
+
}
|
52 |
+
.file 1 "/tmp/torchinductor_root/pk/cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py"
|
53 |
+
.section .debug_abbrev
|
54 |
+
{
|
55 |
+
.b8 1
|
56 |
+
.b8 17
|
57 |
+
.b8 1
|
58 |
+
.b8 37
|
59 |
+
.b8 8
|
60 |
+
.b8 19
|
61 |
+
.b8 5
|
62 |
+
.b8 3
|
63 |
+
.b8 8
|
64 |
+
.b8 16
|
65 |
+
.b8 6
|
66 |
+
.b8 27
|
67 |
+
.b8 8
|
68 |
+
.b8 180
|
69 |
+
.b8 66
|
70 |
+
.b8 12
|
71 |
+
.b8 17
|
72 |
+
.b8 1
|
73 |
+
.b8 18
|
74 |
+
.b8 1
|
75 |
+
.b8 0
|
76 |
+
.b8 0
|
77 |
+
.b8 2
|
78 |
+
.b8 46
|
79 |
+
.b8 0
|
80 |
+
.b8 17
|
81 |
+
.b8 1
|
82 |
+
.b8 18
|
83 |
+
.b8 1
|
84 |
+
.b8 64
|
85 |
+
.b8 10
|
86 |
+
.b8 135
|
87 |
+
.b8 64
|
88 |
+
.b8 8
|
89 |
+
.b8 3
|
90 |
+
.b8 8
|
91 |
+
.b8 58
|
92 |
+
.b8 11
|
93 |
+
.b8 59
|
94 |
+
.b8 11
|
95 |
+
.b8 63
|
96 |
+
.b8 12
|
97 |
+
.b8 0
|
98 |
+
.b8 0
|
99 |
+
.b8 0
|
100 |
+
}
|
101 |
+
.section .debug_info
|
102 |
+
{
|
103 |
+
.b32 172
|
104 |
+
.b8 2
|
105 |
+
.b8 0
|
106 |
+
.b32 .debug_abbrev
|
107 |
+
.b8 8
|
108 |
+
.b8 1
|
109 |
+
.b8 116
|
110 |
+
.b8 114
|
111 |
+
.b8 105
|
112 |
+
.b8 116
|
113 |
+
.b8 111
|
114 |
+
.b8 110
|
115 |
+
.b8 0
|
116 |
+
.b8 2
|
117 |
+
.b8 0
|
118 |
+
.b8 99
|
119 |
+
.b8 112
|
120 |
+
.b8 107
|
121 |
+
.b8 119
|
122 |
+
.b8 51
|
123 |
+
.b8 98
|
124 |
+
.b8 100
|
125 |
+
.b8 111
|
126 |
+
.b8 97
|
127 |
+
.b8 109
|
128 |
+
.b8 108
|
129 |
+
.b8 103
|
130 |
+
.b8 122
|
131 |
+
.b8 118
|
132 |
+
.b8 113
|
133 |
+
.b8 106
|
134 |
+
.b8 101
|
135 |
+
.b8 121
|
136 |
+
.b8 117
|
137 |
+
.b8 107
|
138 |
+
.b8 51
|
139 |
+
.b8 52
|
140 |
+
.b8 98
|
141 |
+
.b8 51
|
142 |
+
.b8 106
|
143 |
+
.b8 99
|
144 |
+
.b8 106
|
145 |
+
.b8 102
|
146 |
+
.b8 53
|
147 |
+
.b8 55
|
148 |
+
.b8 104
|
149 |
+
.b8 116
|
150 |
+
.b8 105
|
151 |
+
.b8 115
|
152 |
+
.b8 97
|
153 |
+
.b8 114
|
154 |
+
.b8 97
|
155 |
+
.b8 55
|
156 |
+
.b8 108
|
157 |
+
.b8 117
|
158 |
+
.b8 107
|
159 |
+
.b8 102
|
160 |
+
.b8 108
|
161 |
+
.b8 101
|
162 |
+
.b8 120
|
163 |
+
.b8 111
|
164 |
+
.b8 51
|
165 |
+
.b8 116
|
166 |
+
.b8 50
|
167 |
+
.b8 50
|
168 |
+
.b8 101
|
169 |
+
.b8 119
|
170 |
+
.b8 46
|
171 |
+
.b8 112
|
172 |
+
.b8 121
|
173 |
+
.b8 0
|
174 |
+
.b32 .debug_line
|
175 |
+
.b8 47
|
176 |
+
.b8 116
|
177 |
+
.b8 109
|
178 |
+
.b8 112
|
179 |
+
.b8 47
|
180 |
+
.b8 116
|
181 |
+
.b8 111
|
182 |
+
.b8 114
|
183 |
+
.b8 99
|
184 |
+
.b8 104
|
185 |
+
.b8 105
|
186 |
+
.b8 110
|
187 |
+
.b8 100
|
188 |
+
.b8 117
|
189 |
+
.b8 99
|
190 |
+
.b8 116
|
191 |
+
.b8 111
|
192 |
+
.b8 114
|
193 |
+
.b8 95
|
194 |
+
.b8 114
|
195 |
+
.b8 111
|
196 |
+
.b8 111
|
197 |
+
.b8 116
|
198 |
+
.b8 47
|
199 |
+
.b8 112
|
200 |
+
.b8 107
|
201 |
+
.b8 0
|
202 |
+
.b8 1
|
203 |
+
.b64 $L__func_begin0
|
204 |
+
.b64 $L__func_end0
|
205 |
+
.b8 2
|
206 |
+
.b64 $L__func_begin0
|
207 |
+
.b64 $L__func_end0
|
208 |
+
.b8 1
|
209 |
+
.b8 156
|
210 |
+
.b8 116
|
211 |
+
.b8 114
|
212 |
+
.b8 105
|
213 |
+
.b8 116
|
214 |
+
.b8 111
|
215 |
+
.b8 110
|
216 |
+
.b8 95
|
217 |
+
.b8 95
|
218 |
+
.b8 48
|
219 |
+
.b8 100
|
220 |
+
.b8 49
|
221 |
+
.b8 100
|
222 |
+
.b8 101
|
223 |
+
.b8 0
|
224 |
+
.b8 116
|
225 |
+
.b8 114
|
226 |
+
.b8 105
|
227 |
+
.b8 116
|
228 |
+
.b8 111
|
229 |
+
.b8 110
|
230 |
+
.b8 95
|
231 |
+
.b8 95
|
232 |
+
.b8 48
|
233 |
+
.b8 100
|
234 |
+
.b8 49
|
235 |
+
.b8 100
|
236 |
+
.b8 101
|
237 |
+
.b8 0
|
238 |
+
.b8 1
|
239 |
+
.b8 18
|
240 |
+
.b8 1
|
241 |
+
.b8 0
|
242 |
+
}
|
243 |
+
.section .debug_pubnames
|
244 |
+
{
|
245 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
246 |
+
$L__pubNames_start0:
|
247 |
+
.b8 2
|
248 |
+
.b8 0
|
249 |
+
.b32 .debug_info
|
250 |
+
.b32 176
|
251 |
+
.b32 125
|
252 |
+
.b8 116
|
253 |
+
.b8 114
|
254 |
+
.b8 105
|
255 |
+
.b8 116
|
256 |
+
.b8 111
|
257 |
+
.b8 110
|
258 |
+
.b8 95
|
259 |
+
.b8 95
|
260 |
+
.b8 48
|
261 |
+
.b8 100
|
262 |
+
.b8 49
|
263 |
+
.b8 100
|
264 |
+
.b8 101
|
265 |
+
.b8 0
|
266 |
+
.b32 0
|
267 |
+
$L__pubNames_end0:
|
268 |
+
}
|
269 |
+
.section .debug_pubtypes
|
270 |
+
{
|
271 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
272 |
+
$L__pubTypes_start0:
|
273 |
+
.b8 2
|
274 |
+
.b8 0
|
275 |
+
.b32 .debug_info
|
276 |
+
.b32 176
|
277 |
+
.b32 0
|
278 |
+
$L__pubTypes_end0:
|
279 |
+
}
|
280 |
+
.section .debug_loc { }
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
|
5 |
+
%c1024_i64 = arith.constant 1024 : i64
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.extsi %0 : i32 to i64
|
8 |
+
%2 = arith.muli %1, %c1024_i64 : i64
|
9 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
10 |
+
%4 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked>
|
11 |
+
%5 = tt.splat %2 : (i64) -> tensor<1024xi64, #blocked>
|
12 |
+
%6 = arith.addi %5, %4 : tensor<1024xi64, #blocked>
|
13 |
+
%7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
14 |
+
%8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi64, #blocked>
|
15 |
+
tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c1024_i64 = arith.constant 1024 : i64
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.extsi %0 : i32 to i64
|
7 |
+
%2 = arith.muli %1, %c1024_i64 : i64
|
8 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
9 |
+
%4 = arith.extsi %3 : tensor<1024xi32> to tensor<1024xi64>
|
10 |
+
%5 = tt.splat %2 : (i64) -> tensor<1024xi64>
|
11 |
+
%6 = arith.addi %5, %4 : tensor<1024xi64>
|
12 |
+
%7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
13 |
+
%8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi64>
|
14 |
+
tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%5 = and i32 %4, 127, !dbg !8
|
9 |
+
%6 = shl nuw nsw i32 %5, 3, !dbg !8
|
10 |
+
%7 = shl nuw nsw i32 %5, 2, !dbg !8
|
11 |
+
%8 = or i32 %7, 512, !dbg !8
|
12 |
+
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
|
13 |
+
%10 = shl i32 %9, 10, !dbg !10
|
14 |
+
%11 = or i32 %10, %6, !dbg !11
|
15 |
+
%12 = or i32 %10, %7, !dbg !11
|
16 |
+
%13 = or i32 %10, %8, !dbg !11
|
17 |
+
%14 = icmp slt i32 %11, 12865792, !dbg !12
|
18 |
+
%15 = icmp slt i32 %12, 12865792, !dbg !12
|
19 |
+
%16 = icmp slt i32 %13, 12865792, !dbg !12
|
20 |
+
%17 = sext i32 %11 to i64, !dbg !13
|
21 |
+
%18 = getelementptr i16, ptr addrspace(1) %0, i64 %17, !dbg !13
|
22 |
+
%19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %18, i1 %14) #2, !dbg !14
|
23 |
+
%20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !14
|
24 |
+
%21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !14
|
25 |
+
%22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !14
|
26 |
+
%23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !14
|
27 |
+
%24 = trunc i32 %20 to i16, !dbg !14
|
28 |
+
%extelt.offset = lshr i32 %20, 16, !dbg !14
|
29 |
+
%25 = trunc i32 %extelt.offset to i16, !dbg !14
|
30 |
+
%26 = trunc i32 %21 to i16, !dbg !14
|
31 |
+
%extelt.offset1 = lshr i32 %21, 16, !dbg !14
|
32 |
+
%27 = trunc i32 %extelt.offset1 to i16, !dbg !14
|
33 |
+
%28 = trunc i32 %22 to i16, !dbg !14
|
34 |
+
%extelt.offset2 = lshr i32 %22, 16, !dbg !14
|
35 |
+
%29 = trunc i32 %extelt.offset2 to i16, !dbg !14
|
36 |
+
%30 = trunc i32 %23 to i16, !dbg !14
|
37 |
+
%extelt.offset3 = lshr i32 %23, 16, !dbg !14
|
38 |
+
%31 = trunc i32 %extelt.offset3 to i16, !dbg !14
|
39 |
+
%32 = zext nneg i32 %6 to i64, !dbg !15
|
40 |
+
%33 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %32, !dbg !15
|
41 |
+
%34 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !15
|
42 |
+
store <1 x i16> %34, ptr addrspace(3) %33, align 2, !dbg !15
|
43 |
+
%35 = or i32 %6, 1, !dbg !15
|
44 |
+
%36 = zext nneg i32 %35 to i64, !dbg !15
|
45 |
+
%37 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %36, !dbg !15
|
46 |
+
%38 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !15
|
47 |
+
store <1 x i16> %38, ptr addrspace(3) %37, align 2, !dbg !15
|
48 |
+
%39 = or i32 %6, 2, !dbg !15
|
49 |
+
%40 = zext nneg i32 %39 to i64, !dbg !15
|
50 |
+
%41 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %40, !dbg !15
|
51 |
+
%42 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !15
|
52 |
+
store <1 x i16> %42, ptr addrspace(3) %41, align 2, !dbg !15
|
53 |
+
%43 = or i32 %6, 3, !dbg !15
|
54 |
+
%44 = zext nneg i32 %43 to i64, !dbg !15
|
55 |
+
%45 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %44, !dbg !15
|
56 |
+
%46 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !15
|
57 |
+
store <1 x i16> %46, ptr addrspace(3) %45, align 2, !dbg !15
|
58 |
+
%47 = or i32 %6, 4, !dbg !15
|
59 |
+
%48 = zext nneg i32 %47 to i64, !dbg !15
|
60 |
+
%49 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %48, !dbg !15
|
61 |
+
%50 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !15
|
62 |
+
store <1 x i16> %50, ptr addrspace(3) %49, align 2, !dbg !15
|
63 |
+
%51 = or i32 %6, 5, !dbg !15
|
64 |
+
%52 = zext nneg i32 %51 to i64, !dbg !15
|
65 |
+
%53 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %52, !dbg !15
|
66 |
+
%54 = insertelement <1 x i16> undef, i16 %29, i64 0, !dbg !15
|
67 |
+
store <1 x i16> %54, ptr addrspace(3) %53, align 2, !dbg !15
|
68 |
+
%55 = or i32 %6, 6, !dbg !15
|
69 |
+
%56 = zext nneg i32 %55 to i64, !dbg !15
|
70 |
+
%57 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %56, !dbg !15
|
71 |
+
%58 = insertelement <1 x i16> undef, i16 %30, i64 0, !dbg !15
|
72 |
+
store <1 x i16> %58, ptr addrspace(3) %57, align 2, !dbg !15
|
73 |
+
%59 = or i32 %6, 7, !dbg !15
|
74 |
+
%60 = zext nneg i32 %59 to i64, !dbg !15
|
75 |
+
%61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !15
|
76 |
+
%62 = insertelement <1 x i16> undef, i16 %31, i64 0, !dbg !15
|
77 |
+
store <1 x i16> %62, ptr addrspace(3) %61, align 2, !dbg !15
|
78 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !15
|
79 |
+
%63 = zext nneg i32 %7 to i64, !dbg !15
|
80 |
+
%64 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %63, !dbg !15
|
81 |
+
%65 = load i16, ptr addrspace(3) %64, align 2, !dbg !15
|
82 |
+
%66 = or i32 %7, 1, !dbg !15
|
83 |
+
%67 = zext nneg i32 %66 to i64, !dbg !15
|
84 |
+
%68 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %67, !dbg !15
|
85 |
+
%69 = load i16, ptr addrspace(3) %68, align 2, !dbg !15
|
86 |
+
%70 = or i32 %7, 2, !dbg !15
|
87 |
+
%71 = zext nneg i32 %70 to i64, !dbg !15
|
88 |
+
%72 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %71, !dbg !15
|
89 |
+
%73 = load i16, ptr addrspace(3) %72, align 2, !dbg !15
|
90 |
+
%74 = or i32 %7, 3, !dbg !15
|
91 |
+
%75 = zext nneg i32 %74 to i64, !dbg !15
|
92 |
+
%76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !15
|
93 |
+
%77 = load i16, ptr addrspace(3) %76, align 2, !dbg !15
|
94 |
+
%78 = zext nneg i32 %8 to i64, !dbg !15
|
95 |
+
%79 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %78, !dbg !15
|
96 |
+
%80 = load i16, ptr addrspace(3) %79, align 2, !dbg !15
|
97 |
+
%81 = or i32 %7, 513, !dbg !15
|
98 |
+
%82 = zext nneg i32 %81 to i64, !dbg !15
|
99 |
+
%83 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %82, !dbg !15
|
100 |
+
%84 = load i16, ptr addrspace(3) %83, align 2, !dbg !15
|
101 |
+
%85 = or i32 %7, 514, !dbg !15
|
102 |
+
%86 = zext nneg i32 %85 to i64, !dbg !15
|
103 |
+
%87 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %86, !dbg !15
|
104 |
+
%88 = load i16, ptr addrspace(3) %87, align 2, !dbg !15
|
105 |
+
%89 = or i32 %7, 515, !dbg !15
|
106 |
+
%90 = zext nneg i32 %89 to i64, !dbg !15
|
107 |
+
%91 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %90, !dbg !15
|
108 |
+
%92 = load i16, ptr addrspace(3) %91, align 2, !dbg !15
|
109 |
+
%93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !15
|
110 |
+
%94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #2, !dbg !15
|
111 |
+
%95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #2, !dbg !15
|
112 |
+
%96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !15
|
113 |
+
%97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %80) #2, !dbg !15
|
114 |
+
%98 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #2, !dbg !15
|
115 |
+
%99 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %88) #2, !dbg !15
|
116 |
+
%100 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %92) #2, !dbg !15
|
117 |
+
%101 = sext i32 %12 to i64, !dbg !16
|
118 |
+
%102 = getelementptr float, ptr addrspace(1) %1, i64 %101, !dbg !16
|
119 |
+
%103 = sext i32 %13 to i64, !dbg !16
|
120 |
+
%104 = getelementptr float, ptr addrspace(1) %1, i64 %103, !dbg !16
|
121 |
+
%105 = bitcast float %93 to i32, !dbg !17
|
122 |
+
%106 = bitcast float %94 to i32, !dbg !17
|
123 |
+
%107 = bitcast float %95 to i32, !dbg !17
|
124 |
+
%108 = bitcast float %96 to i32, !dbg !17
|
125 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %105, i32 %106, i32 %107, i32 %108, ptr addrspace(1) %102, i1 %15) #2, !dbg !17
|
126 |
+
%109 = bitcast float %97 to i32, !dbg !17
|
127 |
+
%110 = bitcast float %98 to i32, !dbg !17
|
128 |
+
%111 = bitcast float %99 to i32, !dbg !17
|
129 |
+
%112 = bitcast float %100 to i32, !dbg !17
|
130 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %109, i32 %110, i32 %111, i32 %112, ptr addrspace(1) %104, i1 %16) #2, !dbg !17
|
131 |
+
ret void, !dbg !18
|
132 |
+
}
|
133 |
+
|
134 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
135 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
136 |
+
|
137 |
+
; Function Attrs: convergent nocallback nounwind
|
138 |
+
declare void @llvm.nvvm.barrier0() #1
|
139 |
+
|
140 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
141 |
+
attributes #1 = { convergent nocallback nounwind }
|
142 |
+
attributes #2 = { nounwind }
|
143 |
+
|
144 |
+
!llvm.module.flags = !{!0}
|
145 |
+
!llvm.dbg.cu = !{!1}
|
146 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
147 |
+
|
148 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
149 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
150 |
+
!2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
|
151 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
152 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
153 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
154 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
155 |
+
!7 = !{}
|
156 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
157 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
158 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
159 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
160 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
161 |
+
!13 = !DILocation(line: 24, column: 30, scope: !5)
|
162 |
+
!14 = !DILocation(line: 24, column: 35, scope: !5)
|
163 |
+
!15 = !DILocation(line: 24, column: 45, scope: !5)
|
164 |
+
!16 = !DILocation(line: 26, column: 25, scope: !5)
|
165 |
+
!17 = !DILocation(line: 26, column: 36, scope: !5)
|
166 |
+
!18 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 128, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<4>;
|
20 |
+
.reg .b16 %rs<9>;
|
21 |
+
.reg .b32 %r<38>;
|
22 |
+
.reg .b64 %rd<13>;
|
23 |
+
.loc 1 18 0
|
24 |
+
$L__func_begin0:
|
25 |
+
.loc 1 18 0
|
26 |
+
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
28 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
29 |
+
$L__tmp0:
|
30 |
+
.loc 1 21 36
|
31 |
+
mov.u32 %r22, %tid.x;
|
32 |
+
and.b32 %r23, %r22, 127;
|
33 |
+
shl.b32 %r24, %r23, 3;
|
34 |
+
shl.b32 %r25, %r23, 2;
|
35 |
+
.loc 1 20 28
|
36 |
+
mov.u32 %r1, %ctaid.x;
|
37 |
+
.loc 1 20 33
|
38 |
+
shl.b32 %r26, %r1, 10;
|
39 |
+
.loc 1 21 23
|
40 |
+
or.b32 %r27, %r26, %r24;
|
41 |
+
or.b32 %r28, %r26, %r25;
|
42 |
+
or.b32 %r29, %r28, 512;
|
43 |
+
.loc 1 22 21
|
44 |
+
setp.lt.s32 %p1, %r27, 12865792;
|
45 |
+
setp.lt.s32 %p2, %r28, 12865792;
|
46 |
+
setp.lt.s32 %p3, %r29, 12865792;
|
47 |
+
.loc 1 24 30
|
48 |
+
mul.wide.s32 %rd6, %r27, 2;
|
49 |
+
add.s64 %rd1, %rd4, %rd6;
|
50 |
+
.loc 1 24 35
|
51 |
+
mov.u32 %r2, 0x0;
|
52 |
+
mov.u32 %r3, 0x0;
|
53 |
+
mov.u32 %r4, 0x0;
|
54 |
+
mov.u32 %r5, 0x0;
|
55 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
56 |
+
shr.u32 %r30, %r2, 16;
|
57 |
+
shr.u32 %r31, %r3, 16;
|
58 |
+
shr.u32 %r32, %r4, 16;
|
59 |
+
shr.u32 %r33, %r5, 16;
|
60 |
+
.loc 1 24 45
|
61 |
+
shl.b32 %r34, %r23, 4;
|
62 |
+
mov.u32 %r35, global_smem;
|
63 |
+
add.s32 %r36, %r35, %r34;
|
64 |
+
st.shared.u16 [%r36], %r2;
|
65 |
+
st.shared.u16 [%r36+2], %r30;
|
66 |
+
st.shared.u16 [%r36+4], %r3;
|
67 |
+
st.shared.u16 [%r36+6], %r31;
|
68 |
+
st.shared.u16 [%r36+8], %r4;
|
69 |
+
st.shared.u16 [%r36+10], %r32;
|
70 |
+
st.shared.u16 [%r36+12], %r5;
|
71 |
+
st.shared.u16 [%r36+14], %r33;
|
72 |
+
bar.sync 0;
|
73 |
+
add.s32 %r37, %r35, %r24;
|
74 |
+
ld.shared.u16 %rs1, [%r37];
|
75 |
+
ld.shared.u16 %rs2, [%r37+2];
|
76 |
+
ld.shared.u16 %rs3, [%r37+4];
|
77 |
+
ld.shared.u16 %rs4, [%r37+6];
|
78 |
+
ld.shared.u16 %rs5, [%r37+1024];
|
79 |
+
ld.shared.u16 %rs6, [%r37+1026];
|
80 |
+
ld.shared.u16 %rs7, [%r37+1028];
|
81 |
+
ld.shared.u16 %rs8, [%r37+1030];
|
82 |
+
cvt.f32.bf16 %r14, %rs1;
|
83 |
+
cvt.f32.bf16 %r15, %rs2;
|
84 |
+
cvt.f32.bf16 %r16, %rs3;
|
85 |
+
cvt.f32.bf16 %r17, %rs4;
|
86 |
+
cvt.f32.bf16 %r18, %rs5;
|
87 |
+
cvt.f32.bf16 %r19, %rs6;
|
88 |
+
cvt.f32.bf16 %r20, %rs7;
|
89 |
+
cvt.f32.bf16 %r21, %rs8;
|
90 |
+
.loc 1 26 25
|
91 |
+
mul.wide.s32 %rd7, %r28, 4;
|
92 |
+
add.s64 %rd2, %rd5, %rd7;
|
93 |
+
cvt.s64.s32 %rd8, %r26;
|
94 |
+
cvt.u64.u32 %rd9, %r25;
|
95 |
+
or.b64 %rd10, %rd8, %rd9;
|
96 |
+
shl.b64 %rd11, %rd10, 2;
|
97 |
+
add.s64 %rd12, %rd5, %rd11;
|
98 |
+
add.s64 %rd3, %rd12, 2048;
|
99 |
+
.loc 1 26 36
|
100 |
+
@%p2 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
|
101 |
+
@%p3 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
|
102 |
+
.loc 1 26 4
|
103 |
+
ret;
|
104 |
+
$L__tmp1:
|
105 |
+
$L__func_end0:
|
106 |
+
|
107 |
+
}
|
108 |
+
.file 1 "/tmp/torchinductor_root/mx/cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py"
|
109 |
+
.section .debug_abbrev
|
110 |
+
{
|
111 |
+
.b8 1
|
112 |
+
.b8 17
|
113 |
+
.b8 1
|
114 |
+
.b8 37
|
115 |
+
.b8 8
|
116 |
+
.b8 19
|
117 |
+
.b8 5
|
118 |
+
.b8 3
|
119 |
+
.b8 8
|
120 |
+
.b8 16
|
121 |
+
.b8 6
|
122 |
+
.b8 27
|
123 |
+
.b8 8
|
124 |
+
.b8 180
|
125 |
+
.b8 66
|
126 |
+
.b8 12
|
127 |
+
.b8 17
|
128 |
+
.b8 1
|
129 |
+
.b8 18
|
130 |
+
.b8 1
|
131 |
+
.b8 0
|
132 |
+
.b8 0
|
133 |
+
.b8 2
|
134 |
+
.b8 46
|
135 |
+
.b8 0
|
136 |
+
.b8 17
|
137 |
+
.b8 1
|
138 |
+
.b8 18
|
139 |
+
.b8 1
|
140 |
+
.b8 64
|
141 |
+
.b8 10
|
142 |
+
.b8 135
|
143 |
+
.b8 64
|
144 |
+
.b8 8
|
145 |
+
.b8 3
|
146 |
+
.b8 8
|
147 |
+
.b8 58
|
148 |
+
.b8 11
|
149 |
+
.b8 59
|
150 |
+
.b8 11
|
151 |
+
.b8 63
|
152 |
+
.b8 12
|
153 |
+
.b8 0
|
154 |
+
.b8 0
|
155 |
+
.b8 0
|
156 |
+
}
|
157 |
+
.section .debug_info
|
158 |
+
{
|
159 |
+
.b32 176
|
160 |
+
.b8 2
|
161 |
+
.b8 0
|
162 |
+
.b32 .debug_abbrev
|
163 |
+
.b8 8
|
164 |
+
.b8 1
|
165 |
+
.b8 116
|
166 |
+
.b8 114
|
167 |
+
.b8 105
|
168 |
+
.b8 116
|
169 |
+
.b8 111
|
170 |
+
.b8 110
|
171 |
+
.b8 0
|
172 |
+
.b8 2
|
173 |
+
.b8 0
|
174 |
+
.b8 99
|
175 |
+
.b8 109
|
176 |
+
.b8 120
|
177 |
+
.b8 109
|
178 |
+
.b8 50
|
179 |
+
.b8 111
|
180 |
+
.b8 98
|
181 |
+
.b8 117
|
182 |
+
.b8 99
|
183 |
+
.b8 113
|
184 |
+
.b8 102
|
185 |
+
.b8 102
|
186 |
+
.b8 50
|
187 |
+
.b8 122
|
188 |
+
.b8 52
|
189 |
+
.b8 118
|
190 |
+
.b8 99
|
191 |
+
.b8 53
|
192 |
+
.b8 53
|
193 |
+
.b8 122
|
194 |
+
.b8 99
|
195 |
+
.b8 110
|
196 |
+
.b8 115
|
197 |
+
.b8 99
|
198 |
+
.b8 102
|
199 |
+
.b8 117
|
200 |
+
.b8 118
|
201 |
+
.b8 117
|
202 |
+
.b8 114
|
203 |
+
.b8 53
|
204 |
+
.b8 115
|
205 |
+
.b8 50
|
206 |
+
.b8 98
|
207 |
+
.b8 51
|
208 |
+
.b8 101
|
209 |
+
.b8 51
|
210 |
+
.b8 54
|
211 |
+
.b8 100
|
212 |
+
.b8 118
|
213 |
+
.b8 103
|
214 |
+
.b8 109
|
215 |
+
.b8 53
|
216 |
+
.b8 55
|
217 |
+
.b8 113
|
218 |
+
.b8 111
|
219 |
+
.b8 98
|
220 |
+
.b8 97
|
221 |
+
.b8 110
|
222 |
+
.b8 108
|
223 |
+
.b8 112
|
224 |
+
.b8 104
|
225 |
+
.b8 111
|
226 |
+
.b8 46
|
227 |
+
.b8 112
|
228 |
+
.b8 121
|
229 |
+
.b8 0
|
230 |
+
.b32 .debug_line
|
231 |
+
.b8 47
|
232 |
+
.b8 116
|
233 |
+
.b8 109
|
234 |
+
.b8 112
|
235 |
+
.b8 47
|
236 |
+
.b8 116
|
237 |
+
.b8 111
|
238 |
+
.b8 114
|
239 |
+
.b8 99
|
240 |
+
.b8 104
|
241 |
+
.b8 105
|
242 |
+
.b8 110
|
243 |
+
.b8 100
|
244 |
+
.b8 117
|
245 |
+
.b8 99
|
246 |
+
.b8 116
|
247 |
+
.b8 111
|
248 |
+
.b8 114
|
249 |
+
.b8 95
|
250 |
+
.b8 114
|
251 |
+
.b8 111
|
252 |
+
.b8 111
|
253 |
+
.b8 116
|
254 |
+
.b8 47
|
255 |
+
.b8 109
|
256 |
+
.b8 120
|
257 |
+
.b8 0
|
258 |
+
.b8 1
|
259 |
+
.b64 $L__func_begin0
|
260 |
+
.b64 $L__func_end0
|
261 |
+
.b8 2
|
262 |
+
.b64 $L__func_begin0
|
263 |
+
.b64 $L__func_end0
|
264 |
+
.b8 1
|
265 |
+
.b8 156
|
266 |
+
.b8 116
|
267 |
+
.b8 114
|
268 |
+
.b8 105
|
269 |
+
.b8 116
|
270 |
+
.b8 111
|
271 |
+
.b8 110
|
272 |
+
.b8 95
|
273 |
+
.b8 95
|
274 |
+
.b8 48
|
275 |
+
.b8 100
|
276 |
+
.b8 49
|
277 |
+
.b8 100
|
278 |
+
.b8 50
|
279 |
+
.b8 100
|
280 |
+
.b8 101
|
281 |
+
.b8 0
|
282 |
+
.b8 116
|
283 |
+
.b8 114
|
284 |
+
.b8 105
|
285 |
+
.b8 116
|
286 |
+
.b8 111
|
287 |
+
.b8 110
|
288 |
+
.b8 95
|
289 |
+
.b8 95
|
290 |
+
.b8 48
|
291 |
+
.b8 100
|
292 |
+
.b8 49
|
293 |
+
.b8 100
|
294 |
+
.b8 50
|
295 |
+
.b8 100
|
296 |
+
.b8 101
|
297 |
+
.b8 0
|
298 |
+
.b8 1
|
299 |
+
.b8 18
|
300 |
+
.b8 1
|
301 |
+
.b8 0
|
302 |
+
}
|
303 |
+
.section .debug_pubnames
|
304 |
+
{
|
305 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
306 |
+
$L__pubNames_start0:
|
307 |
+
.b8 2
|
308 |
+
.b8 0
|
309 |
+
.b32 .debug_info
|
310 |
+
.b32 180
|
311 |
+
.b32 125
|
312 |
+
.b8 116
|
313 |
+
.b8 114
|
314 |
+
.b8 105
|
315 |
+
.b8 116
|
316 |
+
.b8 111
|
317 |
+
.b8 110
|
318 |
+
.b8 95
|
319 |
+
.b8 95
|
320 |
+
.b8 48
|
321 |
+
.b8 100
|
322 |
+
.b8 49
|
323 |
+
.b8 100
|
324 |
+
.b8 50
|
325 |
+
.b8 100
|
326 |
+
.b8 101
|
327 |
+
.b8 0
|
328 |
+
.b32 0
|
329 |
+
$L__pubNames_end0:
|
330 |
+
}
|
331 |
+
.section .debug_pubtypes
|
332 |
+
{
|
333 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
334 |
+
$L__pubTypes_start0:
|
335 |
+
.b8 2
|
336 |
+
.b8 0
|
337 |
+
.b32 .debug_info
|
338 |
+
.b32 180
|
339 |
+
.b32 0
|
340 |
+
$L__pubTypes_end0:
|
341 |
+
}
|
342 |
+
.section .debug_loc { }
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<12865792> : tensor<1024xi32, #blocked1>
|
7 |
+
%c1024_i32 = arith.constant 1024 : i32
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
10 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
11 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
|
12 |
+
%4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
13 |
+
%5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
|
14 |
+
%6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
|
15 |
+
%7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
|
16 |
+
%8 = arith.cmpi slt, %6, %cst : tensor<1024xi32, #blocked>
|
17 |
+
%9 = arith.cmpi slt, %7, %cst_0 : tensor<1024xi32, #blocked1>
|
18 |
+
%10 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
19 |
+
%11 = tt.addptr %10, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
20 |
+
%12 = tt.load %11, %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
21 |
+
%13 = triton_gpu.convert_layout %12 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
|
22 |
+
%14 = arith.extf %13 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
|
23 |
+
%15 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
|
24 |
+
%16 = tt.addptr %15, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
|
25 |
+
tt.store %16, %14, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
|
26 |
+
tt.return
|
27 |
+
}
|
28 |
+
}
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<12865792> : tensor<1024xi32>
|
4 |
+
%c1024_i32 = arith.constant 1024 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
10 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<1024xi32>
|
11 |
+
%6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
12 |
+
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
13 |
+
%8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
14 |
+
%9 = arith.extf %8 : tensor<1024xbf16> to tensor<1024xf32>
|
15 |
+
%10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
16 |
+
%11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
17 |
+
tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
18 |
+
tt.return
|
19 |
+
}
|
20 |
+
}
|
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin
ADDED
Binary file (26 kB). View file
|
|
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir
ADDED
@@ -0,0 +1,476 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
16 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%9 = and i32 %8, 31, !dbg !10
|
18 |
+
%10 = lshr i32 %8, 5, !dbg !10
|
19 |
+
%11 = and i32 %10, 3, !dbg !10
|
20 |
+
%12 = lshr i32 %9, 1, !dbg !10
|
21 |
+
%13 = shl nuw nsw i32 %11, 4, !dbg !10
|
22 |
+
%14 = or i32 %13, %12, !dbg !10
|
23 |
+
%15 = and i32 %8, 63, !dbg !10
|
24 |
+
%16 = shl i32 %8, 2, !dbg !11
|
25 |
+
%17 = and i32 %16, 4, !dbg !11
|
26 |
+
%18 = and i32 %8, 7, !dbg !11
|
27 |
+
%19 = shl nuw nsw i32 %11, 2, !dbg !12
|
28 |
+
%20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
|
29 |
+
%21 = shl i32 %20, 6, !dbg !14
|
30 |
+
%22 = or i32 %21, %14, !dbg !15
|
31 |
+
%23 = or i32 %21, %15, !dbg !15
|
32 |
+
%24 = sext i32 %22 to i64, !dbg !16
|
33 |
+
%25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
|
34 |
+
%26 = sext i32 %23 to i64, !dbg !16
|
35 |
+
%27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
|
36 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
37 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
38 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
39 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
40 |
+
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
|
41 |
+
%33 = srem i32 %22, 512, !dbg !18
|
42 |
+
%34 = shl nsw i32 %33, 8, !dbg !19
|
43 |
+
%35 = add i64 %32, 50257, !dbg !20
|
44 |
+
%36 = icmp slt i64 %28, 0, !dbg !21
|
45 |
+
%37 = icmp slt i64 %32, 0, !dbg !21
|
46 |
+
%38 = select i1 %37, i64 %35, i64 %32, !dbg !22
|
47 |
+
%39 = icmp ugt i64 %38, 50256, !dbg !23
|
48 |
+
%40 = shl i64 %28, 8, !dbg !24
|
49 |
+
%41 = add i64 %40, 12865792, !dbg !24
|
50 |
+
%42 = select i1 %36, i64 %41, i64 %40, !dbg !24
|
51 |
+
%43 = getelementptr float, ptr addrspace(1) %1, i64 %42
|
52 |
+
br label %44, !dbg !12
|
53 |
+
|
54 |
+
44: ; preds = %7, %76
|
55 |
+
%45 = phi float [ 0.000000e+00, %7 ], [ %96, %76 ]
|
56 |
+
%46 = phi float [ 0.000000e+00, %7 ], [ %97, %76 ]
|
57 |
+
%47 = phi float [ 0.000000e+00, %7 ], [ %98, %76 ]
|
58 |
+
%48 = phi float [ 0.000000e+00, %7 ], [ %99, %76 ]
|
59 |
+
%49 = phi float [ 0.000000e+00, %7 ], [ %100, %76 ]
|
60 |
+
%50 = phi float [ 0.000000e+00, %7 ], [ %101, %76 ]
|
61 |
+
%51 = phi float [ 0.000000e+00, %7 ], [ %102, %76 ]
|
62 |
+
%52 = phi float [ 0.000000e+00, %7 ], [ %103, %76 ]
|
63 |
+
%53 = phi float [ 0.000000e+00, %7 ], [ %120, %76 ]
|
64 |
+
%54 = phi float [ 0.000000e+00, %7 ], [ %121, %76 ]
|
65 |
+
%55 = phi float [ 0.000000e+00, %7 ], [ %122, %76 ]
|
66 |
+
%56 = phi float [ 0.000000e+00, %7 ], [ %123, %76 ]
|
67 |
+
%57 = phi float [ 0.000000e+00, %7 ], [ %108, %76 ]
|
68 |
+
%58 = phi float [ 0.000000e+00, %7 ], [ %109, %76 ]
|
69 |
+
%59 = phi float [ 0.000000e+00, %7 ], [ %110, %76 ]
|
70 |
+
%60 = phi float [ 0.000000e+00, %7 ], [ %111, %76 ]
|
71 |
+
%61 = phi i32 [ 0, %7 ], [ %124, %76 ]
|
72 |
+
%62 = or i32 %61, %17, !dbg !25
|
73 |
+
%63 = add i32 %62, %34, !dbg !26
|
74 |
+
%64 = sext i32 %63 to i64, !dbg !27
|
75 |
+
%65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27
|
76 |
+
%66 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
77 |
+
%67 = extractvalue { i32, i32, i32, i32 } %66, 0, !dbg !28
|
78 |
+
%68 = extractvalue { i32, i32, i32, i32 } %66, 1, !dbg !28
|
79 |
+
%69 = extractvalue { i32, i32, i32, i32 } %66, 2, !dbg !28
|
80 |
+
%70 = extractvalue { i32, i32, i32, i32 } %66, 3, !dbg !28
|
81 |
+
%71 = bitcast i32 %67 to float, !dbg !28
|
82 |
+
%72 = bitcast i32 %68 to float, !dbg !28
|
83 |
+
%73 = bitcast i32 %69 to float, !dbg !28
|
84 |
+
%74 = bitcast i32 %70 to float, !dbg !28
|
85 |
+
br i1 %39, label %75, label %76, !dbg !29
|
86 |
+
|
87 |
+
75: ; preds = %44
|
88 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29
|
89 |
+
br label %76, !dbg !29
|
90 |
+
|
91 |
+
76: ; preds = %75, %44
|
92 |
+
%77 = zext nneg i32 %62 to i64, !dbg !30
|
93 |
+
%78 = getelementptr float, ptr addrspace(1) %43, i64 %77, !dbg !31
|
94 |
+
%79 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %78, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
95 |
+
%80 = extractvalue { i32, i32, i32, i32 } %79, 0, !dbg !32
|
96 |
+
%81 = extractvalue { i32, i32, i32, i32 } %79, 1, !dbg !32
|
97 |
+
%82 = extractvalue { i32, i32, i32, i32 } %79, 2, !dbg !32
|
98 |
+
%83 = extractvalue { i32, i32, i32, i32 } %79, 3, !dbg !32
|
99 |
+
%84 = bitcast i32 %80 to float, !dbg !32
|
100 |
+
%85 = bitcast i32 %81 to float, !dbg !32
|
101 |
+
%86 = bitcast i32 %82 to float, !dbg !32
|
102 |
+
%87 = bitcast i32 %83 to float, !dbg !32
|
103 |
+
%88 = fadd float %71, %84, !dbg !33
|
104 |
+
%89 = fadd float %72, %85, !dbg !33
|
105 |
+
%90 = fadd float %73, %86, !dbg !33
|
106 |
+
%91 = fadd float %74, %87, !dbg !33
|
107 |
+
%92 = fsub float %88, %57, !dbg !34
|
108 |
+
%93 = fsub float %89, %58, !dbg !34
|
109 |
+
%94 = fsub float %90, %59, !dbg !34
|
110 |
+
%95 = fsub float %91, %60, !dbg !34
|
111 |
+
%96 = fadd float %45, 1.000000e+00, !dbg !38
|
112 |
+
%97 = fadd float %46, 1.000000e+00, !dbg !38
|
113 |
+
%98 = fadd float %47, 1.000000e+00, !dbg !38
|
114 |
+
%99 = fadd float %48, 1.000000e+00, !dbg !38
|
115 |
+
%100 = fadd float %49, 1.000000e+00, !dbg !38
|
116 |
+
%101 = fadd float %50, 1.000000e+00, !dbg !38
|
117 |
+
%102 = fadd float %51, 1.000000e+00, !dbg !38
|
118 |
+
%103 = fadd float %52, 1.000000e+00, !dbg !38
|
119 |
+
%104 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %92, float %96) #6, !dbg !39
|
120 |
+
%105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %93, float %97) #6, !dbg !39
|
121 |
+
%106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %94, float %98) #6, !dbg !39
|
122 |
+
%107 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %95, float %99) #6, !dbg !39
|
123 |
+
%108 = fadd float %57, %104, !dbg !40
|
124 |
+
%109 = fadd float %58, %105, !dbg !40
|
125 |
+
%110 = fadd float %59, %106, !dbg !40
|
126 |
+
%111 = fadd float %60, %107, !dbg !40
|
127 |
+
%112 = fsub float %88, %108, !dbg !41
|
128 |
+
%113 = fsub float %89, %109, !dbg !41
|
129 |
+
%114 = fsub float %90, %110, !dbg !41
|
130 |
+
%115 = fsub float %91, %111, !dbg !41
|
131 |
+
%116 = fmul float %92, %112, !dbg !42
|
132 |
+
%117 = fmul float %93, %113, !dbg !42
|
133 |
+
%118 = fmul float %94, %114, !dbg !42
|
134 |
+
%119 = fmul float %95, %115, !dbg !42
|
135 |
+
%120 = fadd float %53, %116, !dbg !43
|
136 |
+
%121 = fadd float %54, %117, !dbg !43
|
137 |
+
%122 = fadd float %55, %118, !dbg !43
|
138 |
+
%123 = fadd float %56, %119, !dbg !43
|
139 |
+
%124 = add nuw nsw i32 %61, 8, !dbg !12
|
140 |
+
%125 = icmp ult i32 %61, 248, !dbg !12
|
141 |
+
br i1 %125, label %44, label %126, !dbg !12
|
142 |
+
|
143 |
+
126: ; preds = %76
|
144 |
+
%127 = lshr i32 %9, 3, !dbg !12
|
145 |
+
%128 = or i32 %19, %127, !dbg !12
|
146 |
+
%129 = mul nuw nsw i32 %128, 12, !dbg !12
|
147 |
+
%130 = add nuw nsw i32 %129, %18, !dbg !12
|
148 |
+
%131 = zext nneg i32 %130 to i64, !dbg !12
|
149 |
+
%132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !12
|
150 |
+
%133 = insertelement <1 x float> undef, float %100, i64 0, !dbg !12
|
151 |
+
store <1 x float> %133, ptr addrspace(3) %132, align 4, !dbg !12
|
152 |
+
%134 = or i32 %18, 192, !dbg !12
|
153 |
+
%135 = add nuw nsw i32 %134, %129, !dbg !12
|
154 |
+
%136 = zext nneg i32 %135 to i64, !dbg !12
|
155 |
+
%137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !12
|
156 |
+
%138 = insertelement <1 x float> undef, float %101, i64 0, !dbg !12
|
157 |
+
store <1 x float> %138, ptr addrspace(3) %137, align 4, !dbg !12
|
158 |
+
%139 = or i32 %18, 384, !dbg !12
|
159 |
+
%140 = add nuw nsw i32 %139, %129, !dbg !12
|
160 |
+
%141 = zext nneg i32 %140 to i64, !dbg !12
|
161 |
+
%142 = getelementptr float, ptr addrspace(3) @global_smem, i64 %141, !dbg !12
|
162 |
+
%143 = insertelement <1 x float> undef, float %102, i64 0, !dbg !12
|
163 |
+
store <1 x float> %143, ptr addrspace(3) %142, align 4, !dbg !12
|
164 |
+
%144 = or i32 %18, 576, !dbg !12
|
165 |
+
%145 = add nuw nsw i32 %144, %129, !dbg !12
|
166 |
+
%146 = zext nneg i32 %145 to i64, !dbg !12
|
167 |
+
%147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !12
|
168 |
+
%148 = insertelement <1 x float> undef, float %103, i64 0, !dbg !12
|
169 |
+
store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !12
|
170 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
171 |
+
%149 = mul nuw nsw i32 %14, 12, !dbg !12
|
172 |
+
%150 = add nuw nsw i32 %149, %17, !dbg !12
|
173 |
+
%151 = zext nneg i32 %150 to i64, !dbg !12
|
174 |
+
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
|
175 |
+
%153 = load float, ptr addrspace(3) %152, align 16, !dbg !12
|
176 |
+
%154 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 1, !dbg !12
|
177 |
+
%155 = load float, ptr addrspace(3) %154, align 4, !dbg !12
|
178 |
+
%156 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 2, !dbg !12
|
179 |
+
%157 = load float, ptr addrspace(3) %156, align 8, !dbg !12
|
180 |
+
%158 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 3, !dbg !12
|
181 |
+
%159 = load float, ptr addrspace(3) %158, align 4, !dbg !12
|
182 |
+
%160 = fsub float %109, %108, !dbg !44
|
183 |
+
%161 = fadd float %153, %155, !dbg !48
|
184 |
+
%162 = fcmp oeq float %161, 0.000000e+00, !dbg !49
|
185 |
+
%163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %155, float %161) #6, !dbg !50
|
186 |
+
%164 = select i1 %162, float 0.000000e+00, float %163, !dbg !51
|
187 |
+
%165 = fmul float %160, %164, !dbg !52
|
188 |
+
%166 = fadd float %108, %165, !dbg !53
|
189 |
+
%167 = fadd float %120, %121, !dbg !54
|
190 |
+
%168 = fmul float %160, %160, !dbg !55
|
191 |
+
%169 = fmul float %168, %153, !dbg !56
|
192 |
+
%170 = fmul float %169, %164, !dbg !57
|
193 |
+
%171 = fadd float %167, %170, !dbg !58
|
194 |
+
%172 = fsub float %110, %166, !dbg !44
|
195 |
+
%173 = fadd float %157, %161, !dbg !48
|
196 |
+
%174 = fcmp oeq float %173, 0.000000e+00, !dbg !49
|
197 |
+
%175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %157, float %173) #6, !dbg !50
|
198 |
+
%176 = select i1 %174, float 0.000000e+00, float %175, !dbg !51
|
199 |
+
%177 = fmul float %176, %172, !dbg !52
|
200 |
+
%178 = fadd float %166, %177, !dbg !53
|
201 |
+
%179 = fadd float %122, %171, !dbg !54
|
202 |
+
%180 = fmul float %172, %172, !dbg !55
|
203 |
+
%181 = fmul float %161, %180, !dbg !56
|
204 |
+
%182 = fmul float %176, %181, !dbg !57
|
205 |
+
%183 = fadd float %179, %182, !dbg !58
|
206 |
+
%184 = fsub float %111, %178, !dbg !44
|
207 |
+
%185 = fadd float %159, %173, !dbg !48
|
208 |
+
%186 = fcmp oeq float %185, 0.000000e+00, !dbg !49
|
209 |
+
%187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %159, float %185) #6, !dbg !50
|
210 |
+
%188 = select i1 %186, float 0.000000e+00, float %187, !dbg !51
|
211 |
+
%189 = fmul float %188, %184, !dbg !52
|
212 |
+
%190 = fadd float %178, %189, !dbg !53
|
213 |
+
%191 = fadd float %123, %183, !dbg !54
|
214 |
+
%192 = fmul float %184, %184, !dbg !55
|
215 |
+
%193 = fmul float %173, %192, !dbg !56
|
216 |
+
%194 = fmul float %188, %193, !dbg !57
|
217 |
+
%195 = fadd float %191, %194, !dbg !58
|
218 |
+
%196 = bitcast float %190 to i32, !dbg !59
|
219 |
+
%197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !59
|
220 |
+
%198 = bitcast i32 %197 to float, !dbg !59
|
221 |
+
%199 = bitcast float %195 to i32, !dbg !59
|
222 |
+
%200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !59
|
223 |
+
%201 = bitcast i32 %200 to float, !dbg !59
|
224 |
+
%202 = bitcast float %185 to i32, !dbg !59
|
225 |
+
%203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !59
|
226 |
+
%204 = bitcast i32 %203 to float, !dbg !59
|
227 |
+
%205 = fsub float %198, %190, !dbg !44
|
228 |
+
%206 = fadd float %185, %204, !dbg !48
|
229 |
+
%207 = fcmp oeq float %206, 0.000000e+00, !dbg !49
|
230 |
+
%208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !50
|
231 |
+
%209 = select i1 %207, float 0.000000e+00, float %208, !dbg !51
|
232 |
+
%210 = fmul float %209, %205, !dbg !52
|
233 |
+
%211 = fadd float %190, %210, !dbg !53
|
234 |
+
%212 = fadd float %195, %201, !dbg !54
|
235 |
+
%213 = fmul float %205, %205, !dbg !55
|
236 |
+
%214 = fmul float %185, %213, !dbg !56
|
237 |
+
%215 = fmul float %209, %214, !dbg !57
|
238 |
+
%216 = fadd float %212, %215, !dbg !58
|
239 |
+
%217 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
|
240 |
+
%218 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
|
241 |
+
%219 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
|
242 |
+
%220 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
|
243 |
+
%221 = fadd float %217, 0x3EE4F8B580000000, !dbg !62
|
244 |
+
%222 = shl i32 %22, 8, !dbg !63
|
245 |
+
br label %223, !dbg !64
|
246 |
+
|
247 |
+
223: ; preds = %126, %__nv_rsqrtf.exit
|
248 |
+
%224 = phi i32 [ 0, %126 ], [ %298, %__nv_rsqrtf.exit ]
|
249 |
+
%225 = or i32 %224, %17, !dbg !65
|
250 |
+
%226 = add i32 %225, %34, !dbg !66
|
251 |
+
%227 = sext i32 %226 to i64, !dbg !67
|
252 |
+
%228 = getelementptr float, ptr addrspace(1) %2, i64 %227, !dbg !67
|
253 |
+
%229 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %228, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
254 |
+
%230 = extractvalue { i32, i32, i32, i32 } %229, 0, !dbg !68
|
255 |
+
%231 = extractvalue { i32, i32, i32, i32 } %229, 1, !dbg !68
|
256 |
+
%232 = extractvalue { i32, i32, i32, i32 } %229, 2, !dbg !68
|
257 |
+
%233 = extractvalue { i32, i32, i32, i32 } %229, 3, !dbg !68
|
258 |
+
%234 = bitcast i32 %230 to float, !dbg !68
|
259 |
+
%235 = bitcast i32 %231 to float, !dbg !68
|
260 |
+
%236 = bitcast i32 %232 to float, !dbg !68
|
261 |
+
%237 = bitcast i32 %233 to float, !dbg !68
|
262 |
+
%238 = zext nneg i32 %225 to i64, !dbg !69
|
263 |
+
%239 = getelementptr float, ptr addrspace(1) %3, i64 %238, !dbg !69
|
264 |
+
%240 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %239, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
|
265 |
+
%241 = extractvalue { i32, i32, i32, i32 } %240, 0, !dbg !70
|
266 |
+
%242 = extractvalue { i32, i32, i32, i32 } %240, 1, !dbg !70
|
267 |
+
%243 = extractvalue { i32, i32, i32, i32 } %240, 2, !dbg !70
|
268 |
+
%244 = extractvalue { i32, i32, i32, i32 } %240, 3, !dbg !70
|
269 |
+
%245 = bitcast i32 %241 to float, !dbg !70
|
270 |
+
%246 = bitcast i32 %242 to float, !dbg !70
|
271 |
+
%247 = bitcast i32 %243 to float, !dbg !70
|
272 |
+
%248 = bitcast i32 %244 to float, !dbg !70
|
273 |
+
br i1 %39, label %249, label %250, !dbg !71
|
274 |
+
|
275 |
+
249: ; preds = %223
|
276 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
|
277 |
+
br label %250, !dbg !71
|
278 |
+
|
279 |
+
250: ; preds = %249, %223
|
280 |
+
%251 = getelementptr float, ptr addrspace(1) %43, i64 %238, !dbg !72
|
281 |
+
%252 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %251, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
282 |
+
%253 = extractvalue { i32, i32, i32, i32 } %252, 0, !dbg !73
|
283 |
+
%254 = extractvalue { i32, i32, i32, i32 } %252, 1, !dbg !73
|
284 |
+
%255 = extractvalue { i32, i32, i32, i32 } %252, 2, !dbg !73
|
285 |
+
%256 = extractvalue { i32, i32, i32, i32 } %252, 3, !dbg !73
|
286 |
+
%257 = bitcast i32 %253 to float, !dbg !73
|
287 |
+
%258 = bitcast i32 %254 to float, !dbg !73
|
288 |
+
%259 = bitcast i32 %255 to float, !dbg !73
|
289 |
+
%260 = bitcast i32 %256 to float, !dbg !73
|
290 |
+
%261 = fadd float %234, %257, !dbg !74
|
291 |
+
%262 = fadd float %235, %258, !dbg !74
|
292 |
+
%263 = fadd float %236, %259, !dbg !74
|
293 |
+
%264 = fadd float %237, %260, !dbg !74
|
294 |
+
%265 = fsub float %261, %211, !dbg !75
|
295 |
+
%266 = fsub float %262, %211, !dbg !75
|
296 |
+
%267 = fsub float %263, %211, !dbg !75
|
297 |
+
%268 = fsub float %264, %211, !dbg !75
|
298 |
+
%269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
299 |
+
%.not.i = icmp eq i32 %269, 0, !dbg !76
|
300 |
+
br i1 %.not.i, label %272, label %270, !dbg !76
|
301 |
+
|
302 |
+
270: ; preds = %250
|
303 |
+
%271 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %221), !dbg !76
|
304 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
305 |
+
|
306 |
+
272: ; preds = %250
|
307 |
+
%273 = tail call float @llvm.nvvm.rsqrt.approx.f(float %221), !dbg !76
|
308 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
309 |
+
|
310 |
+
__nv_rsqrtf.exit: ; preds = %270, %272
|
311 |
+
%.0.i = phi float [ %271, %270 ], [ %273, %272 ], !dbg !76
|
312 |
+
%274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
313 |
+
%275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
314 |
+
%276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
315 |
+
%277 = fmul float %265, %.0.i, !dbg !77
|
316 |
+
%278 = fmul float %266, %.0.i, !dbg !77
|
317 |
+
%279 = fmul float %267, %.0.i, !dbg !77
|
318 |
+
%280 = fmul float %268, %.0.i, !dbg !77
|
319 |
+
%281 = fmul float %277, %245, !dbg !78
|
320 |
+
%282 = fmul float %278, %246, !dbg !78
|
321 |
+
%283 = fmul float %279, %247, !dbg !78
|
322 |
+
%284 = fmul float %280, %248, !dbg !78
|
323 |
+
%285 = add i32 %225, %222, !dbg !79
|
324 |
+
%286 = sext i32 %285 to i64, !dbg !80
|
325 |
+
%287 = getelementptr i16, ptr addrspace(1) %4, i64 %286, !dbg !80
|
326 |
+
%288 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %281) #6, !dbg !81
|
327 |
+
%289 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %282) #6, !dbg !81
|
328 |
+
%290 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %283) #6, !dbg !81
|
329 |
+
%291 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %284) #6, !dbg !81
|
330 |
+
%292 = insertelement <2 x i16> undef, i16 %288, i64 0, !dbg !81
|
331 |
+
%293 = insertelement <2 x i16> %292, i16 %289, i64 1, !dbg !81
|
332 |
+
%294 = bitcast <2 x i16> %293 to i32, !dbg !81
|
333 |
+
%295 = insertelement <2 x i16> undef, i16 %290, i64 0, !dbg !81
|
334 |
+
%296 = insertelement <2 x i16> %295, i16 %291, i64 1, !dbg !81
|
335 |
+
%297 = bitcast <2 x i16> %296 to i32, !dbg !81
|
336 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %294, i32 %297, ptr addrspace(1) %287, i1 true) #6, !dbg !81
|
337 |
+
%298 = add nuw nsw i32 %224, 8, !dbg !64
|
338 |
+
%299 = icmp ult i32 %224, 248, !dbg !64
|
339 |
+
br i1 %299, label %223, label %300, !dbg !64
|
340 |
+
|
341 |
+
300: ; preds = %__nv_rsqrtf.exit
|
342 |
+
ret void, !dbg !82
|
343 |
+
}
|
344 |
+
|
345 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
346 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
347 |
+
|
348 |
+
; Function Attrs: convergent nocallback nounwind
|
349 |
+
declare void @llvm.nvvm.barrier0() #1
|
350 |
+
|
351 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
352 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
353 |
+
|
354 |
+
; Function Attrs: alwaysinline nounwind
|
355 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
356 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
357 |
+
%.not = icmp eq i32 %1, 0
|
358 |
+
br i1 %.not, label %4, label %2
|
359 |
+
|
360 |
+
2: ; preds = %0
|
361 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
362 |
+
br label %6
|
363 |
+
|
364 |
+
4: ; preds = %0
|
365 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
366 |
+
br label %6
|
367 |
+
|
368 |
+
6: ; preds = %4, %2
|
369 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
370 |
+
ret float %.0
|
371 |
+
}
|
372 |
+
|
373 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
374 |
+
|
375 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
376 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
377 |
+
|
378 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
379 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
380 |
+
|
381 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
382 |
+
attributes #1 = { convergent nocallback nounwind }
|
383 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
384 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
385 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
386 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
387 |
+
attributes #6 = { nounwind }
|
388 |
+
|
389 |
+
!llvm.module.flags = !{!0, !1}
|
390 |
+
!llvm.dbg.cu = !{!2}
|
391 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
392 |
+
!llvm.ident = !{!6}
|
393 |
+
|
394 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
395 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
396 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
397 |
+
!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
|
398 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
399 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
|
400 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
401 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
402 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
403 |
+
!9 = !{}
|
404 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
405 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
406 |
+
!12 = !DILocation(line: 31, column: 36, scope: !7)
|
407 |
+
!13 = !DILocation(line: 21, column: 28, scope: !7)
|
408 |
+
!14 = !DILocation(line: 21, column: 33, scope: !7)
|
409 |
+
!15 = !DILocation(line: 22, column: 23, scope: !7)
|
410 |
+
!16 = !DILocation(line: 26, column: 30, scope: !7)
|
411 |
+
!17 = !DILocation(line: 26, column: 35, scope: !7)
|
412 |
+
!18 = !DILocation(line: 27, column: 18, scope: !7)
|
413 |
+
!19 = !DILocation(line: 35, column: 44, scope: !7)
|
414 |
+
!20 = !DILocation(line: 36, column: 22, scope: !7)
|
415 |
+
!21 = !DILocation(line: 37, column: 22, scope: !7)
|
416 |
+
!22 = !DILocation(line: 38, column: 36, scope: !7)
|
417 |
+
!23 = !DILocation(line: 39, column: 40, scope: !7)
|
418 |
+
!24 = !DILocation(line: 40, column: 44, scope: !7)
|
419 |
+
!25 = !DILocation(line: 32, column: 27, scope: !7)
|
420 |
+
!26 = !DILocation(line: 35, column: 40, scope: !7)
|
421 |
+
!27 = !DILocation(line: 35, column: 34, scope: !7)
|
422 |
+
!28 = !DILocation(line: 35, column: 50, scope: !7)
|
423 |
+
!29 = !DILocation(line: 39, column: 55, scope: !7)
|
424 |
+
!30 = !DILocation(line: 40, column: 40, scope: !7)
|
425 |
+
!31 = !DILocation(line: 40, column: 34, scope: !7)
|
426 |
+
!32 = !DILocation(line: 40, column: 52, scope: !7)
|
427 |
+
!33 = !DILocation(line: 41, column: 22, scope: !7)
|
428 |
+
!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
|
429 |
+
!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
430 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
431 |
+
!37 = !DILocation(line: 44, column: 38, scope: !35)
|
432 |
+
!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
|
433 |
+
!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
|
434 |
+
!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
|
435 |
+
!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
|
436 |
+
!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
|
437 |
+
!43 = !DILocation(line: 47, column: 48, scope: !7)
|
438 |
+
!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
|
439 |
+
!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
|
440 |
+
!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
|
441 |
+
!47 = !DILocation(line: 50, column: 41, scope: !45)
|
442 |
+
!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
|
443 |
+
!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
|
444 |
+
!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
|
445 |
+
!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
|
446 |
+
!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
|
447 |
+
!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
|
448 |
+
!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
|
449 |
+
!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
|
450 |
+
!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
|
451 |
+
!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
|
452 |
+
!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
|
453 |
+
!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
|
454 |
+
!60 = !DILocation(line: 50, column: 41, scope: !35)
|
455 |
+
!61 = !DILocation(line: 69, column: 23, scope: !7)
|
456 |
+
!62 = !DILocation(line: 71, column: 24, scope: !7)
|
457 |
+
!63 = !DILocation(line: 76, column: 39, scope: !7)
|
458 |
+
!64 = !DILocation(line: 55, column: 36, scope: !7)
|
459 |
+
!65 = !DILocation(line: 56, column: 27, scope: !7)
|
460 |
+
!66 = !DILocation(line: 59, column: 41, scope: !7)
|
461 |
+
!67 = !DILocation(line: 59, column: 35, scope: !7)
|
462 |
+
!68 = !DILocation(line: 59, column: 51, scope: !7)
|
463 |
+
!69 = !DILocation(line: 60, column: 35, scope: !7)
|
464 |
+
!70 = !DILocation(line: 60, column: 40, scope: !7)
|
465 |
+
!71 = !DILocation(line: 64, column: 57, scope: !7)
|
466 |
+
!72 = !DILocation(line: 65, column: 35, scope: !7)
|
467 |
+
!73 = !DILocation(line: 65, column: 54, scope: !7)
|
468 |
+
!74 = !DILocation(line: 66, column: 24, scope: !7)
|
469 |
+
!75 = !DILocation(line: 67, column: 24, scope: !7)
|
470 |
+
!76 = !DILocation(line: 72, column: 30, scope: !7)
|
471 |
+
!77 = !DILocation(line: 73, column: 24, scope: !7)
|
472 |
+
!78 = !DILocation(line: 74, column: 24, scope: !7)
|
473 |
+
!79 = !DILocation(line: 76, column: 35, scope: !7)
|
474 |
+
!80 = !DILocation(line: 76, column: 29, scope: !7)
|
475 |
+
!81 = !DILocation(line: 76, column: 52, scope: !7)
|
476 |
+
!82 = !DILocation(line: 55, column: 4, scope: !7)
|