0-hero commited on
Commit
f9d5f95
·
verified ·
1 Parent(s): 00602c7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .local/share/jupyter/nbextensions/help_panel/img/handle-v.png +0 -0
  2. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir +523 -0
  3. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir +165 -0
  4. .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin +0 -0
  5. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx +764 -0
  6. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir +26 -0
  7. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir +25 -0
  8. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin +0 -0
  9. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir +60 -0
  10. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir +184 -0
  11. .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin +0 -0
  12. .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir +18 -0
  13. .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin +0 -0
  14. .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir +332 -0
  15. .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir +25 -0
  16. .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx +278 -0
  17. .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin +0 -0
  18. .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir +162 -0
  19. .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx +338 -0
  20. .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir +24 -0
  21. .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir +18 -0
  22. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin +0 -0
  23. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir +132 -0
  24. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir +62 -0
  25. .triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin +0 -0
  26. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir +62 -0
  27. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin +0 -0
  28. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx +465 -0
  29. .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin +0 -0
  30. .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir +424 -0
  31. .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx +921 -0
  32. .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir +81 -0
  33. .triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir +88 -0
  34. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin +0 -0
  35. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir +55 -0
  36. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx +297 -0
  37. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir +21 -0
  38. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir +20 -0
  39. .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin +0 -0
  40. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin +0 -0
  41. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir +48 -0
  42. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx +280 -0
  43. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir +18 -0
  44. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir +17 -0
  45. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir +166 -0
  46. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx +342 -0
  47. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir +28 -0
  48. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir +20 -0
  49. .triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin +0 -0
  50. .triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir +476 -0
.local/share/jupyter/nbextensions/help_panel/img/handle-v.png ADDED
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = and i32 %9, 31, !dbg !10
18
+ %11 = lshr i32 %9, 5, !dbg !10
19
+ %12 = and i32 %11, 3, !dbg !10
20
+ %13 = lshr i32 %10, 1, !dbg !10
21
+ %14 = shl nuw nsw i32 %12, 4, !dbg !10
22
+ %15 = or i32 %14, %13, !dbg !10
23
+ %16 = and i32 %9, 63, !dbg !10
24
+ %17 = shl i32 %9, 2, !dbg !11
25
+ %18 = and i32 %17, 4, !dbg !11
26
+ %19 = and i32 %9, 7, !dbg !11
27
+ %20 = shl nuw nsw i32 %12, 2, !dbg !12
28
+ %21 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
29
+ %22 = shl i32 %21, 6, !dbg !14
30
+ %23 = or i32 %22, %15, !dbg !15
31
+ %24 = or i32 %22, %16, !dbg !15
32
+ %25 = sext i32 %23 to i64, !dbg !16
33
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
34
+ %27 = sext i32 %24 to i64, !dbg !16
35
+ %28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !16
36
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
37
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
38
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
39
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
40
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #6, !dbg !17
41
+ %34 = srem i32 %23, 512, !dbg !18
42
+ %35 = shl nsw i32 %34, 8, !dbg !19
43
+ %36 = shl i32 %23, 8, !dbg !20
44
+ %37 = add i64 %33, 50257, !dbg !21
45
+ %38 = icmp slt i64 %29, 0, !dbg !22
46
+ %39 = icmp slt i64 %33, 0, !dbg !22
47
+ %40 = select i1 %39, i64 %37, i64 %33, !dbg !23
48
+ %41 = icmp ugt i64 %40, 50256, !dbg !24
49
+ %42 = shl i64 %29, 8, !dbg !25
50
+ %43 = add i64 %42, 12865792, !dbg !25
51
+ %44 = select i1 %38, i64 %43, i64 %42, !dbg !25
52
+ %45 = getelementptr float, ptr addrspace(1) %1, i64 %44
53
+ br label %46, !dbg !12
54
+
55
+ 46: ; preds = %8, %92
56
+ %47 = phi float [ 0.000000e+00, %8 ], [ %116, %92 ]
57
+ %48 = phi float [ 0.000000e+00, %8 ], [ %117, %92 ]
58
+ %49 = phi float [ 0.000000e+00, %8 ], [ %118, %92 ]
59
+ %50 = phi float [ 0.000000e+00, %8 ], [ %119, %92 ]
60
+ %51 = phi float [ 0.000000e+00, %8 ], [ %120, %92 ]
61
+ %52 = phi float [ 0.000000e+00, %8 ], [ %121, %92 ]
62
+ %53 = phi float [ 0.000000e+00, %8 ], [ %122, %92 ]
63
+ %54 = phi float [ 0.000000e+00, %8 ], [ %123, %92 ]
64
+ %55 = phi float [ 0.000000e+00, %8 ], [ %140, %92 ]
65
+ %56 = phi float [ 0.000000e+00, %8 ], [ %141, %92 ]
66
+ %57 = phi float [ 0.000000e+00, %8 ], [ %142, %92 ]
67
+ %58 = phi float [ 0.000000e+00, %8 ], [ %143, %92 ]
68
+ %59 = phi float [ 0.000000e+00, %8 ], [ %128, %92 ]
69
+ %60 = phi float [ 0.000000e+00, %8 ], [ %129, %92 ]
70
+ %61 = phi float [ 0.000000e+00, %8 ], [ %130, %92 ]
71
+ %62 = phi float [ 0.000000e+00, %8 ], [ %131, %92 ]
72
+ %63 = phi i32 [ 0, %8 ], [ %144, %92 ]
73
+ %64 = or i32 %63, %18, !dbg !26
74
+ %65 = add i32 %64, %35, !dbg !27
75
+ %66 = sext i32 %65 to i64, !dbg !28
76
+ %67 = getelementptr float, ptr addrspace(1) %2, i64 %66, !dbg !28
77
+ %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %67, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
78
+ %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !29
79
+ %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !29
80
+ %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !29
81
+ %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !29
82
+ %73 = bitcast i32 %69 to float, !dbg !29
83
+ %74 = bitcast i32 %70 to float, !dbg !29
84
+ %75 = bitcast i32 %71 to float, !dbg !29
85
+ %76 = bitcast i32 %72 to float, !dbg !29
86
+ %77 = add i32 %64, %36, !dbg !30
87
+ %78 = sext i32 %77 to i64, !dbg !31
88
+ %79 = getelementptr i16, ptr addrspace(1) %3, i64 %78, !dbg !31
89
+ %80 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
90
+ %81 = extractvalue { i32, i32 } %80, 0, !dbg !32
91
+ %82 = extractvalue { i32, i32 } %80, 1, !dbg !32
92
+ %83 = trunc i32 %81 to i16, !dbg !32
93
+ %extelt.offset3 = lshr i32 %81, 16, !dbg !32
94
+ %84 = trunc i32 %extelt.offset3 to i16, !dbg !32
95
+ %85 = trunc i32 %82 to i16, !dbg !32
96
+ %extelt.offset4 = lshr i32 %82, 16, !dbg !32
97
+ %86 = trunc i32 %extelt.offset4 to i16, !dbg !32
98
+ %87 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %83) #6, !dbg !33
99
+ %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #6, !dbg !33
100
+ %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #6, !dbg !33
101
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #6, !dbg !33
102
+ br i1 %41, label %91, label %92, !dbg !34
103
+
104
+ 91: ; preds = %46
105
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
106
+ br label %92, !dbg !34
107
+
108
+ 92: ; preds = %91, %46
109
+ %93 = zext nneg i32 %64 to i64, !dbg !35
110
+ %94 = getelementptr float, ptr addrspace(1) %45, i64 %93, !dbg !36
111
+ %95 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %94, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
112
+ %96 = extractvalue { i32, i32, i32, i32 } %95, 0, !dbg !37
113
+ %97 = extractvalue { i32, i32, i32, i32 } %95, 1, !dbg !37
114
+ %98 = extractvalue { i32, i32, i32, i32 } %95, 2, !dbg !37
115
+ %99 = extractvalue { i32, i32, i32, i32 } %95, 3, !dbg !37
116
+ %100 = bitcast i32 %96 to float, !dbg !37
117
+ %101 = bitcast i32 %97 to float, !dbg !37
118
+ %102 = bitcast i32 %98 to float, !dbg !37
119
+ %103 = bitcast i32 %99 to float, !dbg !37
120
+ %104 = fadd float %73, %100, !dbg !38
121
+ %105 = fadd float %74, %101, !dbg !38
122
+ %106 = fadd float %75, %102, !dbg !38
123
+ %107 = fadd float %76, %103, !dbg !38
124
+ %108 = fadd float %87, %104, !dbg !39
125
+ %109 = fadd float %88, %105, !dbg !39
126
+ %110 = fadd float %89, %106, !dbg !39
127
+ %111 = fadd float %90, %107, !dbg !39
128
+ %112 = fsub float %108, %59, !dbg !40
129
+ %113 = fsub float %109, %60, !dbg !40
130
+ %114 = fsub float %110, %61, !dbg !40
131
+ %115 = fsub float %111, %62, !dbg !40
132
+ %116 = fadd float %47, 1.000000e+00, !dbg !44
133
+ %117 = fadd float %48, 1.000000e+00, !dbg !44
134
+ %118 = fadd float %49, 1.000000e+00, !dbg !44
135
+ %119 = fadd float %50, 1.000000e+00, !dbg !44
136
+ %120 = fadd float %51, 1.000000e+00, !dbg !44
137
+ %121 = fadd float %52, 1.000000e+00, !dbg !44
138
+ %122 = fadd float %53, 1.000000e+00, !dbg !44
139
+ %123 = fadd float %54, 1.000000e+00, !dbg !44
140
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float %116) #6, !dbg !45
141
+ %125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %113, float %117) #6, !dbg !45
142
+ %126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float %118) #6, !dbg !45
143
+ %127 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %115, float %119) #6, !dbg !45
144
+ %128 = fadd float %59, %124, !dbg !46
145
+ %129 = fadd float %60, %125, !dbg !46
146
+ %130 = fadd float %61, %126, !dbg !46
147
+ %131 = fadd float %62, %127, !dbg !46
148
+ %132 = fsub float %108, %128, !dbg !47
149
+ %133 = fsub float %109, %129, !dbg !47
150
+ %134 = fsub float %110, %130, !dbg !47
151
+ %135 = fsub float %111, %131, !dbg !47
152
+ %136 = fmul float %112, %132, !dbg !48
153
+ %137 = fmul float %113, %133, !dbg !48
154
+ %138 = fmul float %114, %134, !dbg !48
155
+ %139 = fmul float %115, %135, !dbg !48
156
+ %140 = fadd float %55, %136, !dbg !49
157
+ %141 = fadd float %56, %137, !dbg !49
158
+ %142 = fadd float %57, %138, !dbg !49
159
+ %143 = fadd float %58, %139, !dbg !49
160
+ %144 = add nuw nsw i32 %63, 8, !dbg !12
161
+ %145 = icmp ult i32 %63, 248, !dbg !12
162
+ br i1 %145, label %46, label %146, !dbg !12
163
+
164
+ 146: ; preds = %92
165
+ %147 = lshr i32 %10, 3, !dbg !12
166
+ %148 = or i32 %20, %147, !dbg !12
167
+ %149 = mul nuw nsw i32 %148, 12, !dbg !12
168
+ %150 = add nuw nsw i32 %149, %19, !dbg !12
169
+ %151 = zext nneg i32 %150 to i64, !dbg !12
170
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
171
+ %153 = insertelement <1 x float> undef, float %120, i64 0, !dbg !12
172
+ store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !12
173
+ %154 = or i32 %19, 192, !dbg !12
174
+ %155 = add nuw nsw i32 %154, %149, !dbg !12
175
+ %156 = zext nneg i32 %155 to i64, !dbg !12
176
+ %157 = getelementptr float, ptr addrspace(3) @global_smem, i64 %156, !dbg !12
177
+ %158 = insertelement <1 x float> undef, float %121, i64 0, !dbg !12
178
+ store <1 x float> %158, ptr addrspace(3) %157, align 4, !dbg !12
179
+ %159 = or i32 %19, 384, !dbg !12
180
+ %160 = add nuw nsw i32 %159, %149, !dbg !12
181
+ %161 = zext nneg i32 %160 to i64, !dbg !12
182
+ %162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !12
183
+ %163 = insertelement <1 x float> undef, float %122, i64 0, !dbg !12
184
+ store <1 x float> %163, ptr addrspace(3) %162, align 4, !dbg !12
185
+ %164 = or i32 %19, 576, !dbg !12
186
+ %165 = add nuw nsw i32 %164, %149, !dbg !12
187
+ %166 = zext nneg i32 %165 to i64, !dbg !12
188
+ %167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !12
189
+ %168 = insertelement <1 x float> undef, float %123, i64 0, !dbg !12
190
+ store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !12
191
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
192
+ %169 = mul nuw nsw i32 %15, 12, !dbg !12
193
+ %170 = add nuw nsw i32 %169, %18, !dbg !12
194
+ %171 = zext nneg i32 %170 to i64, !dbg !12
195
+ %172 = getelementptr float, ptr addrspace(3) @global_smem, i64 %171, !dbg !12
196
+ %173 = load float, ptr addrspace(3) %172, align 16, !dbg !12
197
+ %174 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 1, !dbg !12
198
+ %175 = load float, ptr addrspace(3) %174, align 4, !dbg !12
199
+ %176 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 2, !dbg !12
200
+ %177 = load float, ptr addrspace(3) %176, align 8, !dbg !12
201
+ %178 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 3, !dbg !12
202
+ %179 = load float, ptr addrspace(3) %178, align 4, !dbg !12
203
+ %180 = fsub float %129, %128, !dbg !50
204
+ %181 = fadd float %173, %175, !dbg !54
205
+ %182 = fcmp oeq float %181, 0.000000e+00, !dbg !55
206
+ %183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %181) #6, !dbg !56
207
+ %184 = select i1 %182, float 0.000000e+00, float %183, !dbg !57
208
+ %185 = fmul float %180, %184, !dbg !58
209
+ %186 = fadd float %128, %185, !dbg !59
210
+ %187 = fadd float %140, %141, !dbg !60
211
+ %188 = fmul float %180, %180, !dbg !61
212
+ %189 = fmul float %188, %173, !dbg !62
213
+ %190 = fmul float %189, %184, !dbg !63
214
+ %191 = fadd float %187, %190, !dbg !64
215
+ %192 = fsub float %130, %186, !dbg !50
216
+ %193 = fadd float %177, %181, !dbg !54
217
+ %194 = fcmp oeq float %193, 0.000000e+00, !dbg !55
218
+ %195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %193) #6, !dbg !56
219
+ %196 = select i1 %194, float 0.000000e+00, float %195, !dbg !57
220
+ %197 = fmul float %196, %192, !dbg !58
221
+ %198 = fadd float %186, %197, !dbg !59
222
+ %199 = fadd float %142, %191, !dbg !60
223
+ %200 = fmul float %192, %192, !dbg !61
224
+ %201 = fmul float %181, %200, !dbg !62
225
+ %202 = fmul float %196, %201, !dbg !63
226
+ %203 = fadd float %199, %202, !dbg !64
227
+ %204 = fsub float %131, %198, !dbg !50
228
+ %205 = fadd float %179, %193, !dbg !54
229
+ %206 = fcmp oeq float %205, 0.000000e+00, !dbg !55
230
+ %207 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %179, float %205) #6, !dbg !56
231
+ %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !57
232
+ %209 = fmul float %208, %204, !dbg !58
233
+ %210 = fadd float %198, %209, !dbg !59
234
+ %211 = fadd float %143, %203, !dbg !60
235
+ %212 = fmul float %204, %204, !dbg !61
236
+ %213 = fmul float %193, %212, !dbg !62
237
+ %214 = fmul float %208, %213, !dbg !63
238
+ %215 = fadd float %211, %214, !dbg !64
239
+ %216 = bitcast float %210 to i32, !dbg !65
240
+ %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !65
241
+ %218 = bitcast i32 %217 to float, !dbg !65
242
+ %219 = bitcast float %215 to i32, !dbg !65
243
+ %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !65
244
+ %221 = bitcast i32 %220 to float, !dbg !65
245
+ %222 = bitcast float %205 to i32, !dbg !65
246
+ %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !65
247
+ %224 = bitcast i32 %223 to float, !dbg !65
248
+ %225 = fsub float %218, %210, !dbg !50
249
+ %226 = fadd float %205, %224, !dbg !54
250
+ %227 = fcmp oeq float %226, 0.000000e+00, !dbg !55
251
+ %228 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %224, float %226) #6, !dbg !56
252
+ %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !57
253
+ %230 = fmul float %229, %225, !dbg !58
254
+ %231 = fadd float %210, %230, !dbg !59
255
+ %232 = fadd float %215, %221, !dbg !60
256
+ %233 = fmul float %225, %225, !dbg !61
257
+ %234 = fmul float %205, %233, !dbg !62
258
+ %235 = fmul float %229, %234, !dbg !63
259
+ %236 = fadd float %232, %235, !dbg !64
260
+ %237 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
261
+ %238 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
262
+ %239 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
263
+ %240 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
264
+ %241 = fadd float %237, 0x3EE4F8B580000000, !dbg !68
265
+ br label %242, !dbg !69
266
+
267
+ 242: ; preds = %146, %__nv_rsqrtf.exit
268
+ %243 = phi i32 [ 0, %146 ], [ %333, %__nv_rsqrtf.exit ]
269
+ %244 = or i32 %243, %18, !dbg !70
270
+ %245 = add i32 %244, %35, !dbg !71
271
+ %246 = sext i32 %245 to i64, !dbg !72
272
+ %247 = getelementptr float, ptr addrspace(1) %2, i64 %246, !dbg !72
273
+ %248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
274
+ %249 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !73
275
+ %250 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !73
276
+ %251 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !73
277
+ %252 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !73
278
+ %253 = bitcast i32 %249 to float, !dbg !73
279
+ %254 = bitcast i32 %250 to float, !dbg !73
280
+ %255 = bitcast i32 %251 to float, !dbg !73
281
+ %256 = bitcast i32 %252 to float, !dbg !73
282
+ %257 = add i32 %244, %36, !dbg !74
283
+ %258 = sext i32 %257 to i64, !dbg !75
284
+ %259 = getelementptr i16, ptr addrspace(1) %3, i64 %258, !dbg !75
285
+ %260 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %259, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
286
+ %261 = extractvalue { i32, i32 } %260, 0, !dbg !76
287
+ %262 = extractvalue { i32, i32 } %260, 1, !dbg !76
288
+ %263 = trunc i32 %261 to i16, !dbg !76
289
+ %extelt.offset = lshr i32 %261, 16, !dbg !76
290
+ %264 = trunc i32 %extelt.offset to i16, !dbg !76
291
+ %265 = trunc i32 %262 to i16, !dbg !76
292
+ %extelt.offset2 = lshr i32 %262, 16, !dbg !76
293
+ %266 = trunc i32 %extelt.offset2 to i16, !dbg !76
294
+ %267 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %263) #6, !dbg !77
295
+ %268 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %264) #6, !dbg !77
296
+ %269 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %265) #6, !dbg !77
297
+ %270 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %266) #6, !dbg !77
298
+ %271 = zext nneg i32 %244 to i64, !dbg !78
299
+ %272 = getelementptr float, ptr addrspace(1) %4, i64 %271, !dbg !78
300
+ %273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %272, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
301
+ %274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !79
302
+ %275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !79
303
+ %276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !79
304
+ %277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !79
305
+ %278 = bitcast i32 %274 to float, !dbg !79
306
+ %279 = bitcast i32 %275 to float, !dbg !79
307
+ %280 = bitcast i32 %276 to float, !dbg !79
308
+ %281 = bitcast i32 %277 to float, !dbg !79
309
+ br i1 %41, label %282, label %283, !dbg !80
310
+
311
+ 282: ; preds = %242
312
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
313
+ br label %283, !dbg !80
314
+
315
+ 283: ; preds = %282, %242
316
+ %284 = getelementptr float, ptr addrspace(1) %45, i64 %271, !dbg !81
317
+ %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %284, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
318
+ %286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !82
319
+ %287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !82
320
+ %288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !82
321
+ %289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !82
322
+ %290 = bitcast i32 %286 to float, !dbg !82
323
+ %291 = bitcast i32 %287 to float, !dbg !82
324
+ %292 = bitcast i32 %288 to float, !dbg !82
325
+ %293 = bitcast i32 %289 to float, !dbg !82
326
+ %294 = fadd float %253, %290, !dbg !83
327
+ %295 = fadd float %254, %291, !dbg !83
328
+ %296 = fadd float %255, %292, !dbg !83
329
+ %297 = fadd float %256, %293, !dbg !83
330
+ %298 = fadd float %267, %294, !dbg !84
331
+ %299 = fadd float %268, %295, !dbg !84
332
+ %300 = fadd float %269, %296, !dbg !84
333
+ %301 = fadd float %270, %297, !dbg !84
334
+ %302 = fsub float %298, %231, !dbg !85
335
+ %303 = fsub float %299, %231, !dbg !85
336
+ %304 = fsub float %300, %231, !dbg !85
337
+ %305 = fsub float %301, %231, !dbg !85
338
+ %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
339
+ %.not.i = icmp eq i32 %306, 0, !dbg !86
340
+ br i1 %.not.i, label %309, label %307, !dbg !86
341
+
342
+ 307: ; preds = %283
343
+ %308 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %241), !dbg !86
344
+ br label %__nv_rsqrtf.exit, !dbg !86
345
+
346
+ 309: ; preds = %283
347
+ %310 = tail call float @llvm.nvvm.rsqrt.approx.f(float %241), !dbg !86
348
+ br label %__nv_rsqrtf.exit, !dbg !86
349
+
350
+ __nv_rsqrtf.exit: ; preds = %307, %309
351
+ %.0.i = phi float [ %308, %307 ], [ %310, %309 ], !dbg !86
352
+ %311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
353
+ %312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
354
+ %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
355
+ %314 = fmul float %302, %.0.i, !dbg !87
356
+ %315 = fmul float %303, %.0.i, !dbg !87
357
+ %316 = fmul float %304, %.0.i, !dbg !87
358
+ %317 = fmul float %305, %.0.i, !dbg !87
359
+ %318 = fmul float %314, %278, !dbg !88
360
+ %319 = fmul float %315, %279, !dbg !88
361
+ %320 = fmul float %316, %280, !dbg !88
362
+ %321 = fmul float %317, %281, !dbg !88
363
+ %322 = getelementptr i16, ptr addrspace(1) %5, i64 %258, !dbg !89
364
+ %323 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %318) #6, !dbg !90
365
+ %324 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %319) #6, !dbg !90
366
+ %325 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %320) #6, !dbg !90
367
+ %326 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !90
368
+ %327 = insertelement <2 x i16> undef, i16 %323, i64 0, !dbg !90
369
+ %328 = insertelement <2 x i16> %327, i16 %324, i64 1, !dbg !90
370
+ %329 = bitcast <2 x i16> %328 to i32, !dbg !90
371
+ %330 = insertelement <2 x i16> undef, i16 %325, i64 0, !dbg !90
372
+ %331 = insertelement <2 x i16> %330, i16 %326, i64 1, !dbg !90
373
+ %332 = bitcast <2 x i16> %331 to i32, !dbg !90
374
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %329, i32 %332, ptr addrspace(1) %322, i1 true) #6, !dbg !90
375
+ %333 = add nuw nsw i32 %243, 8, !dbg !69
376
+ %334 = icmp ult i32 %243, 248, !dbg !69
377
+ br i1 %334, label %242, label %335, !dbg !69
378
+
379
+ 335: ; preds = %__nv_rsqrtf.exit
380
+ ret void, !dbg !91
381
+ }
382
+
383
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
384
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
385
+
386
+ ; Function Attrs: convergent nocallback nounwind
387
+ declare void @llvm.nvvm.barrier0() #1
388
+
389
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
390
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
391
+
392
+ ; Function Attrs: alwaysinline nounwind
393
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
394
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
395
+ %.not = icmp eq i32 %1, 0
396
+ br i1 %.not, label %4, label %2
397
+
398
+ 2: ; preds = %0
399
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
400
+ br label %6
401
+
402
+ 4: ; preds = %0
403
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
404
+ br label %6
405
+
406
+ 6: ; preds = %4, %2
407
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
408
+ ret float %.0
409
+ }
410
+
411
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
412
+
413
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
414
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
415
+
416
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
417
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
418
+
419
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
420
+ attributes #1 = { convergent nocallback nounwind }
421
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
422
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
423
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
424
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
425
+ attributes #6 = { nounwind }
426
+
427
+ !llvm.module.flags = !{!0, !1}
428
+ !llvm.dbg.cu = !{!2}
429
+ !nvvm.annotations = !{!4, !5, !5, !4}
430
+ !llvm.ident = !{!6}
431
+
432
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
433
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
434
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
435
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
436
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
437
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
438
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
439
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
440
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
441
+ !9 = !{}
442
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
443
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
444
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
445
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
446
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
447
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
448
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
449
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
450
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
451
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
452
+ !20 = !DILocation(line: 36, column: 44, scope: !7)
453
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
454
+ !22 = !DILocation(line: 38, column: 22, scope: !7)
455
+ !23 = !DILocation(line: 39, column: 36, scope: !7)
456
+ !24 = !DILocation(line: 40, column: 40, scope: !7)
457
+ !25 = !DILocation(line: 41, column: 44, scope: !7)
458
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
459
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
460
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
461
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
462
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
463
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
464
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
465
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
466
+ !34 = !DILocation(line: 40, column: 55, scope: !7)
467
+ !35 = !DILocation(line: 41, column: 40, scope: !7)
468
+ !36 = !DILocation(line: 41, column: 34, scope: !7)
469
+ !37 = !DILocation(line: 41, column: 52, scope: !7)
470
+ !38 = !DILocation(line: 42, column: 22, scope: !7)
471
+ !39 = !DILocation(line: 44, column: 22, scope: !7)
472
+ !40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
473
+ !41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
474
+ !42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
475
+ !43 = !DILocation(line: 47, column: 41, scope: !41)
476
+ !44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
477
+ !45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
478
+ !46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
479
+ !47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
480
+ !48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
481
+ !49 = !DILocation(line: 50, column: 50, scope: !7)
482
+ !50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
483
+ !51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
484
+ !52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
485
+ !53 = !DILocation(line: 53, column: 44, scope: !51)
486
+ !54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
487
+ !55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
488
+ !56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
489
+ !57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
490
+ !58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
491
+ !59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
492
+ !60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
493
+ !61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
494
+ !62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
495
+ !63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
496
+ !64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
497
+ !65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
498
+ !66 = !DILocation(line: 53, column: 44, scope: !41)
499
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
500
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
501
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
502
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
503
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
504
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
505
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
506
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
507
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
508
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
509
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
510
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
511
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
512
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
513
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
514
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
515
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
516
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
517
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
518
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
519
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
520
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
521
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
522
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
523
+ !91 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
11
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
16
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c8_i32 = arith.constant 8 : i32
19
+ %c256_i32 = arith.constant 256 : i32
20
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2>
21
+ %cst_11 = arith.constant 0.000000e+00 : f32
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
26
+ %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
27
+ %c64_i32 = arith.constant 64 : i32
28
+ %0 = tt.get_program_id x : i32
29
+ %1 = arith.muli %0, %c64_i32 : i32
30
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
31
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
32
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
33
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
34
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
35
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
36
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
37
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
38
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
39
+ %11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
40
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
41
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2>
42
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
43
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
44
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
45
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
46
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
47
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
48
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
49
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
50
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
51
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
52
+ %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
53
+ %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
54
+ %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
55
+ %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
56
+ %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
57
+ %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
58
+ %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
59
+ %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
60
+ %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
61
+ %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
62
+ %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
63
+ %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>
64
+ %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>
65
+ %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
66
+ %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
67
+ %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>) : i32 {
68
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
69
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked2>
70
+ %51 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
71
+ %52 = arith.addi %50, %13 : tensor<1x8xi32, #blocked2>
72
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x8xi32, #blocked>
73
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x8xi32, #blocked2>
74
+ %55 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
75
+ %56 = arith.addi %55, %22 : tensor<64x8xi32, #blocked>
76
+ %57 = tt.addptr %23, %56 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
77
+ %58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
78
+ %59 = tt.broadcast %54 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2>
79
+ %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
80
+ %61 = arith.addi %55, %25 : tensor<64x8xi32, #blocked>
81
+ %62 = tt.addptr %26, %61 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
82
+ %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
83
+ %64 = arith.extf %63 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
84
+ tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
85
+ %65 = arith.extsi %51 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
86
+ %66 = tt.broadcast %65 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
87
+ %67 = arith.addi %66, %37 : tensor<64x8xi64, #blocked>
88
+ %68 = tt.addptr %38, %67 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
89
+ %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
90
+ %70 = arith.addf %69, %60 : tensor<64x8xf32, #blocked>
91
+ %71 = arith.addf %70, %64 : tensor<64x8xf32, #blocked>
92
+ %72 = arith.subf %71, %arg9 : tensor<64x8xf32, #blocked>
93
+ %73 = arith.addf %arg12, %cst_4 : tensor<64x8xf32, #blocked>
94
+ %74 = arith.addf %arg11, %cst_10 : tensor<64x8xf32, #blocked2>
95
+ %75 = arith.divf %72, %73 : tensor<64x8xf32, #blocked>
96
+ %76 = arith.addf %arg9, %75 : tensor<64x8xf32, #blocked>
97
+ %77 = arith.subf %71, %76 : tensor<64x8xf32, #blocked>
98
+ %78 = arith.mulf %72, %77 : tensor<64x8xf32, #blocked>
99
+ %79 = arith.addf %arg10, %78 : tensor<64x8xf32, #blocked>
100
+ %80 = arith.select %58, %76, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
101
+ %81 = arith.select %58, %79, %arg10 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
102
+ %82 = arith.select %58, %73, %arg12 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
103
+ %83 = arith.select %59, %74, %arg11 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2>
104
+ scf.yield %80, %81, %83, %82 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>
105
+ }
106
+ %40 = triton_gpu.convert_layout %39#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked>
107
+ %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
108
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
109
+ %49 = arith.subf %arg11, %arg8 : f32
110
+ %50 = arith.addf %arg10, %arg13 : f32
111
+ %51 = arith.cmpf oeq, %50, %cst_11 : f32
112
+ %52 = arith.divf %arg13, %50 : f32
113
+ %53 = arith.select %51, %cst_11, %52 : f32
114
+ %54 = arith.mulf %49, %53 : f32
115
+ %55 = arith.addf %arg8, %54 : f32
116
+ %56 = arith.addf %arg9, %arg12 : f32
117
+ %57 = arith.mulf %49, %49 : f32
118
+ %58 = arith.mulf %57, %arg10 : f32
119
+ %59 = arith.mulf %58, %53 : f32
120
+ %60 = arith.addf %56, %59 : f32
121
+ tt.reduce.return %55, %60, %50 : f32, f32, f32
122
+ }) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
123
+ %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
124
+ %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
125
+ %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
126
+ %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
127
+ %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>
128
+ %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>
129
+ %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
130
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
131
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
132
+ %50 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
133
+ %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked>
134
+ %52 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
135
+ %53 = arith.addi %52, %22 : tensor<64x8xi32, #blocked>
136
+ %54 = tt.addptr %23, %53 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
137
+ %55 = tt.broadcast %51 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
138
+ %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
139
+ %57 = arith.addi %52, %25 : tensor<64x8xi32, #blocked>
140
+ %58 = tt.addptr %26, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
141
+ %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
142
+ %60 = arith.extf %59 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
143
+ %61 = tt.addptr %44, %50 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
144
+ %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
145
+ tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
146
+ %63 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
147
+ %64 = tt.broadcast %63 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
148
+ %65 = arith.addi %64, %37 : tensor<64x8xi64, #blocked>
149
+ %66 = tt.addptr %38, %65 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
150
+ %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
151
+ %68 = arith.addf %67, %56 : tensor<64x8xf32, #blocked>
152
+ %69 = arith.addf %68, %60 : tensor<64x8xf32, #blocked>
153
+ %70 = arith.subf %69, %45 : tensor<64x8xf32, #blocked>
154
+ %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
155
+ %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
156
+ %73 = arith.mulf %70, %72 : tensor<64x8xf32, #blocked>
157
+ %74 = tt.broadcast %62 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked>
158
+ %75 = arith.mulf %73, %74 : tensor<64x8xf32, #blocked>
159
+ %76 = tt.addptr %48, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
160
+ %77 = arith.truncf %75 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
161
+ tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked>
162
+ }
163
+ tt.return
164
+ }
165
+ }
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin ADDED
Binary file (18.3 kB). View file
 
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1de(
13
+ .param .u64 triton__0d1de_param_0,
14
+ .param .u32 triton__0d1de_param_1
15
+ )
16
+ .maxntid 128, 1, 1
17
+ {
18
+ .reg .pred %p<27>;
19
+ .reg .b16 %rs<17>;
20
+ .reg .b32 %r<67>;
21
+ .reg .f32 %f<431>;
22
+ .reg .b64 %rd<6>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r14, %tid.x;
31
+ shl.b32 %r15, %r14, 3;
32
+ and.b32 %r16, %r15, 1016;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r17, %r1, 10;
37
+ .loc 1 21 23
38
+ or.b32 %r18, %r17, %r16;
39
+ .loc 1 24 34
40
+ mul.wide.s32 %rd4, %r18, 2;
41
+ add.s64 %rd5, %rd3, %rd4;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 39
44
+ mov.u32 %r2, 0x0;
45
+ mov.u32 %r3, 0x0;
46
+ mov.u32 %r4, 0x0;
47
+ mov.u32 %r5, 0x0;
48
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd5 + 0 ];
49
+ cvt.u16.u32 %rs1, %r2;
50
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
51
+ cvt.u16.u32 %rs3, %r3;
52
+ .loc 1 24 48
53
+ cvt.f32.bf16 %r6, %rs1;
54
+ mov.b32 %f1, %r6;
55
+ cvt.f32.bf16 %r7, %rs2;
56
+ mov.b32 %f2, %r7;
57
+ .loc 1 29 18
58
+ mul.f32 %f9, %f1, 0f3F3504F3;
59
+ .loc 1 30 23
60
+ abs.ftz.f32 %f17, %f9;
61
+ setp.ge.f32 %p2, %f17, 0f3F8060FE;
62
+ mov.f32 %f365, 0f3789CA3C;
63
+ mov.f32 %f364, 0fB9F560B9;
64
+ mov.f32 %f363, 0f3BAC840B;
65
+ mov.f32 %f362, 0fBD0C8162;
66
+ mov.f32 %f361, 0f3E1CF906;
67
+ mov.f32 %f360, 0f3F6A937E;
68
+ mov.f32 %f359, 0f3F20D842;
69
+ mov.f32 %f366, %f17;
70
+ @%p2 bra $L__BB0_2;
71
+ .loc 1 0 23
72
+ mov.f32 %f365, 0f38B1E96A;
73
+ mov.f32 %f364, 0fBA574D20;
74
+ mov.f32 %f363, 0f3BAAD5EA;
75
+ mov.f32 %f362, 0fBCDC1BE7;
76
+ mov.f32 %f361, 0f3DE718AF;
77
+ mov.f32 %f360, 0fBEC093AC;
78
+ mov.f32 %f359, 0f3E0375D3;
79
+ .loc 1 30 23
80
+ mul.f32 %f366, %f9, %f9;
81
+ $L__BB0_2:
82
+ .loc 1 0 0
83
+ cvt.f32.bf16 %r8, %rs3;
84
+ mul.f32 %f10, %f2, 0f3F3504F3;
85
+ .loc 1 30 23
86
+ setp.ltu.f32 %p3, %f17, 0f3F8060FE;
87
+ fma.rn.ftz.f32 %f135, %f365, %f366, %f364;
88
+ fma.rn.ftz.f32 %f136, %f135, %f366, %f363;
89
+ fma.rn.ftz.f32 %f137, %f136, %f366, %f362;
90
+ fma.rn.ftz.f32 %f138, %f137, %f366, %f361;
91
+ fma.rn.ftz.f32 %f139, %f138, %f366, %f360;
92
+ fma.rn.ftz.f32 %f140, %f139, %f366, %f359;
93
+ neg.f32 %f141, %f366;
94
+ selp.f32 %f142, %f141, %f9, %p2;
95
+ fma.rn.ftz.f32 %f367, %f140, %f142, %f142;
96
+ mov.f32 %f358, 0f3F800000;
97
+ @%p3 bra $L__BB0_4;
98
+ ex2.approx.ftz.f32 %f143, %f367;
99
+ sub.f32 %f145, %f358, %f143;
100
+ mov.b32 %r19, %f145;
101
+ mov.b32 %r20, %f9;
102
+ and.b32 %r21, %r20, -2147483648;
103
+ or.b32 %r22, %r21, %r19;
104
+ mov.b32 %f367, %r22;
105
+ $L__BB0_4:
106
+ .loc 1 0 0
107
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
108
+ mov.b32 %f3, %r8;
109
+ .loc 1 30 23
110
+ abs.ftz.f32 %f30, %f10;
111
+ setp.ge.f32 %p5, %f30, 0f3F8060FE;
112
+ mov.f32 %f374, 0f3789CA3C;
113
+ mov.f32 %f373, 0fB9F560B9;
114
+ mov.f32 %f372, 0f3BAC840B;
115
+ mov.f32 %f371, 0fBD0C8162;
116
+ mov.f32 %f370, 0f3E1CF906;
117
+ mov.f32 %f369, 0f3F6A937E;
118
+ mov.f32 %f368, 0f3F20D842;
119
+ mov.f32 %f375, %f30;
120
+ @%p5 bra $L__BB0_6;
121
+ mul.f32 %f375, %f10, %f10;
122
+ mov.f32 %f374, 0f38B1E96A;
123
+ mov.f32 %f373, 0fBA574D20;
124
+ mov.f32 %f372, 0f3BAAD5EA;
125
+ mov.f32 %f371, 0fBCDC1BE7;
126
+ mov.f32 %f370, 0f3DE718AF;
127
+ mov.f32 %f369, 0fBEC093AC;
128
+ mov.f32 %f368, 0f3E0375D3;
129
+ $L__BB0_6:
130
+ .loc 1 0 0
131
+ cvt.f32.bf16 %r9, %rs4;
132
+ mul.f32 %f11, %f3, 0f3F3504F3;
133
+ .loc 1 30 23
134
+ setp.ltu.f32 %p6, %f30, 0f3F8060FE;
135
+ fma.rn.ftz.f32 %f160, %f374, %f375, %f373;
136
+ fma.rn.ftz.f32 %f161, %f160, %f375, %f372;
137
+ fma.rn.ftz.f32 %f162, %f161, %f375, %f371;
138
+ fma.rn.ftz.f32 %f163, %f162, %f375, %f370;
139
+ fma.rn.ftz.f32 %f164, %f163, %f375, %f369;
140
+ fma.rn.ftz.f32 %f165, %f164, %f375, %f368;
141
+ neg.f32 %f166, %f375;
142
+ selp.f32 %f167, %f166, %f10, %p5;
143
+ fma.rn.ftz.f32 %f376, %f165, %f167, %f167;
144
+ @%p6 bra $L__BB0_8;
145
+ ex2.approx.ftz.f32 %f168, %f376;
146
+ sub.f32 %f170, %f358, %f168;
147
+ mov.b32 %r23, %f170;
148
+ mov.b32 %r24, %f10;
149
+ and.b32 %r25, %r24, -2147483648;
150
+ or.b32 %r26, %r25, %r23;
151
+ mov.b32 %f376, %r26;
152
+ $L__BB0_8:
153
+ .loc 1 0 0
154
+ cvt.u16.u32 %rs5, %r4;
155
+ mov.b32 %f4, %r9;
156
+ .loc 1 30 23
157
+ abs.ftz.f32 %f43, %f11;
158
+ setp.ge.f32 %p8, %f43, 0f3F8060FE;
159
+ mov.f32 %f383, 0f3789CA3C;
160
+ mov.f32 %f382, 0fB9F560B9;
161
+ mov.f32 %f381, 0f3BAC840B;
162
+ mov.f32 %f380, 0fBD0C8162;
163
+ mov.f32 %f379, 0f3E1CF906;
164
+ mov.f32 %f378, 0f3F6A937E;
165
+ mov.f32 %f377, 0f3F20D842;
166
+ mov.f32 %f384, %f43;
167
+ @%p8 bra $L__BB0_10;
168
+ mul.f32 %f384, %f11, %f11;
169
+ mov.f32 %f383, 0f38B1E96A;
170
+ mov.f32 %f382, 0fBA574D20;
171
+ mov.f32 %f381, 0f3BAAD5EA;
172
+ mov.f32 %f380, 0fBCDC1BE7;
173
+ mov.f32 %f379, 0f3DE718AF;
174
+ mov.f32 %f378, 0fBEC093AC;
175
+ mov.f32 %f377, 0f3E0375D3;
176
+ $L__BB0_10:
177
+ .loc 1 0 0
178
+ cvt.f32.bf16 %r10, %rs5;
179
+ mul.f32 %f12, %f4, 0f3F3504F3;
180
+ .loc 1 30 23
181
+ setp.ltu.f32 %p9, %f43, 0f3F8060FE;
182
+ fma.rn.ftz.f32 %f185, %f383, %f384, %f382;
183
+ fma.rn.ftz.f32 %f186, %f185, %f384, %f381;
184
+ fma.rn.ftz.f32 %f187, %f186, %f384, %f380;
185
+ fma.rn.ftz.f32 %f188, %f187, %f384, %f379;
186
+ fma.rn.ftz.f32 %f189, %f188, %f384, %f378;
187
+ fma.rn.ftz.f32 %f190, %f189, %f384, %f377;
188
+ neg.f32 %f191, %f384;
189
+ selp.f32 %f192, %f191, %f11, %p8;
190
+ fma.rn.ftz.f32 %f385, %f190, %f192, %f192;
191
+ @%p9 bra $L__BB0_12;
192
+ ex2.approx.ftz.f32 %f193, %f385;
193
+ sub.f32 %f195, %f358, %f193;
194
+ mov.b32 %r27, %f195;
195
+ mov.b32 %r28, %f11;
196
+ and.b32 %r29, %r28, -2147483648;
197
+ or.b32 %r30, %r29, %r27;
198
+ mov.b32 %f385, %r30;
199
+ $L__BB0_12:
200
+ .loc 1 0 0
201
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
202
+ mov.b32 %f5, %r10;
203
+ .loc 1 30 23
204
+ abs.ftz.f32 %f56, %f12;
205
+ setp.ge.f32 %p11, %f56, 0f3F8060FE;
206
+ mov.f32 %f392, 0f3789CA3C;
207
+ mov.f32 %f391, 0fB9F560B9;
208
+ mov.f32 %f390, 0f3BAC840B;
209
+ mov.f32 %f389, 0fBD0C8162;
210
+ mov.f32 %f388, 0f3E1CF906;
211
+ mov.f32 %f387, 0f3F6A937E;
212
+ mov.f32 %f386, 0f3F20D842;
213
+ mov.f32 %f393, %f56;
214
+ @%p11 bra $L__BB0_14;
215
+ mul.f32 %f393, %f12, %f12;
216
+ mov.f32 %f392, 0f38B1E96A;
217
+ mov.f32 %f391, 0fBA574D20;
218
+ mov.f32 %f390, 0f3BAAD5EA;
219
+ mov.f32 %f389, 0fBCDC1BE7;
220
+ mov.f32 %f388, 0f3DE718AF;
221
+ mov.f32 %f387, 0fBEC093AC;
222
+ mov.f32 %f386, 0f3E0375D3;
223
+ $L__BB0_14:
224
+ .loc 1 0 0
225
+ cvt.f32.bf16 %r11, %rs6;
226
+ mul.f32 %f13, %f5, 0f3F3504F3;
227
+ .loc 1 30 23
228
+ setp.ltu.f32 %p12, %f56, 0f3F8060FE;
229
+ fma.rn.ftz.f32 %f210, %f392, %f393, %f391;
230
+ fma.rn.ftz.f32 %f211, %f210, %f393, %f390;
231
+ fma.rn.ftz.f32 %f212, %f211, %f393, %f389;
232
+ fma.rn.ftz.f32 %f213, %f212, %f393, %f388;
233
+ fma.rn.ftz.f32 %f214, %f213, %f393, %f387;
234
+ fma.rn.ftz.f32 %f215, %f214, %f393, %f386;
235
+ neg.f32 %f216, %f393;
236
+ selp.f32 %f217, %f216, %f12, %p11;
237
+ fma.rn.ftz.f32 %f394, %f215, %f217, %f217;
238
+ @%p12 bra $L__BB0_16;
239
+ ex2.approx.ftz.f32 %f218, %f394;
240
+ sub.f32 %f220, %f358, %f218;
241
+ mov.b32 %r31, %f220;
242
+ mov.b32 %r32, %f12;
243
+ and.b32 %r33, %r32, -2147483648;
244
+ or.b32 %r34, %r33, %r31;
245
+ mov.b32 %f394, %r34;
246
+ $L__BB0_16:
247
+ .loc 1 0 0
248
+ cvt.u16.u32 %rs7, %r5;
249
+ mov.b32 %f6, %r11;
250
+ .loc 1 30 23
251
+ abs.ftz.f32 %f69, %f13;
252
+ setp.ge.f32 %p14, %f69, 0f3F8060FE;
253
+ mov.f32 %f401, 0f3789CA3C;
254
+ mov.f32 %f400, 0fB9F560B9;
255
+ mov.f32 %f399, 0f3BAC840B;
256
+ mov.f32 %f398, 0fBD0C8162;
257
+ mov.f32 %f397, 0f3E1CF906;
258
+ mov.f32 %f396, 0f3F6A937E;
259
+ mov.f32 %f395, 0f3F20D842;
260
+ mov.f32 %f402, %f69;
261
+ @%p14 bra $L__BB0_18;
262
+ mul.f32 %f402, %f13, %f13;
263
+ mov.f32 %f401, 0f38B1E96A;
264
+ mov.f32 %f400, 0fBA574D20;
265
+ mov.f32 %f399, 0f3BAAD5EA;
266
+ mov.f32 %f398, 0fBCDC1BE7;
267
+ mov.f32 %f397, 0f3DE718AF;
268
+ mov.f32 %f396, 0fBEC093AC;
269
+ mov.f32 %f395, 0f3E0375D3;
270
+ $L__BB0_18:
271
+ .loc 1 0 0
272
+ cvt.f32.bf16 %r12, %rs7;
273
+ mul.f32 %f14, %f6, 0f3F3504F3;
274
+ .loc 1 30 23
275
+ setp.ltu.f32 %p15, %f69, 0f3F8060FE;
276
+ fma.rn.ftz.f32 %f235, %f401, %f402, %f400;
277
+ fma.rn.ftz.f32 %f236, %f235, %f402, %f399;
278
+ fma.rn.ftz.f32 %f237, %f236, %f402, %f398;
279
+ fma.rn.ftz.f32 %f238, %f237, %f402, %f397;
280
+ fma.rn.ftz.f32 %f239, %f238, %f402, %f396;
281
+ fma.rn.ftz.f32 %f240, %f239, %f402, %f395;
282
+ neg.f32 %f241, %f402;
283
+ selp.f32 %f242, %f241, %f13, %p14;
284
+ fma.rn.ftz.f32 %f403, %f240, %f242, %f242;
285
+ @%p15 bra $L__BB0_20;
286
+ ex2.approx.ftz.f32 %f243, %f403;
287
+ sub.f32 %f245, %f358, %f243;
288
+ mov.b32 %r35, %f245;
289
+ mov.b32 %r36, %f13;
290
+ and.b32 %r37, %r36, -2147483648;
291
+ or.b32 %r38, %r37, %r35;
292
+ mov.b32 %f403, %r38;
293
+ $L__BB0_20:
294
+ .loc 1 0 0
295
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
296
+ mov.b32 %f7, %r12;
297
+ .loc 1 30 23
298
+ abs.ftz.f32 %f82, %f14;
299
+ setp.ge.f32 %p17, %f82, 0f3F8060FE;
300
+ mov.f32 %f410, 0f3789CA3C;
301
+ mov.f32 %f409, 0fB9F560B9;
302
+ mov.f32 %f408, 0f3BAC840B;
303
+ mov.f32 %f407, 0fBD0C8162;
304
+ mov.f32 %f406, 0f3E1CF906;
305
+ mov.f32 %f405, 0f3F6A937E;
306
+ mov.f32 %f404, 0f3F20D842;
307
+ mov.f32 %f411, %f82;
308
+ @%p17 bra $L__BB0_22;
309
+ mul.f32 %f411, %f14, %f14;
310
+ mov.f32 %f410, 0f38B1E96A;
311
+ mov.f32 %f409, 0fBA574D20;
312
+ mov.f32 %f408, 0f3BAAD5EA;
313
+ mov.f32 %f407, 0fBCDC1BE7;
314
+ mov.f32 %f406, 0f3DE718AF;
315
+ mov.f32 %f405, 0fBEC093AC;
316
+ mov.f32 %f404, 0f3E0375D3;
317
+ $L__BB0_22:
318
+ .loc 1 0 0
319
+ cvt.f32.bf16 %r13, %rs8;
320
+ mul.f32 %f15, %f7, 0f3F3504F3;
321
+ .loc 1 30 23
322
+ setp.ltu.f32 %p18, %f82, 0f3F8060FE;
323
+ fma.rn.ftz.f32 %f260, %f410, %f411, %f409;
324
+ fma.rn.ftz.f32 %f261, %f260, %f411, %f408;
325
+ fma.rn.ftz.f32 %f262, %f261, %f411, %f407;
326
+ fma.rn.ftz.f32 %f263, %f262, %f411, %f406;
327
+ fma.rn.ftz.f32 %f264, %f263, %f411, %f405;
328
+ fma.rn.ftz.f32 %f265, %f264, %f411, %f404;
329
+ neg.f32 %f266, %f411;
330
+ selp.f32 %f267, %f266, %f14, %p17;
331
+ fma.rn.ftz.f32 %f412, %f265, %f267, %f267;
332
+ @%p18 bra $L__BB0_24;
333
+ ex2.approx.ftz.f32 %f268, %f412;
334
+ sub.f32 %f270, %f358, %f268;
335
+ mov.b32 %r39, %f270;
336
+ mov.b32 %r40, %f14;
337
+ and.b32 %r41, %r40, -2147483648;
338
+ or.b32 %r42, %r41, %r39;
339
+ mov.b32 %f412, %r42;
340
+ $L__BB0_24:
341
+ .loc 1 0 0
342
+ mov.b32 %f8, %r13;
343
+ .loc 1 30 23
344
+ abs.ftz.f32 %f95, %f15;
345
+ setp.ge.f32 %p20, %f95, 0f3F8060FE;
346
+ mov.f32 %f419, 0f3789CA3C;
347
+ mov.f32 %f418, 0fB9F560B9;
348
+ mov.f32 %f417, 0f3BAC840B;
349
+ mov.f32 %f416, 0fBD0C8162;
350
+ mov.f32 %f415, 0f3E1CF906;
351
+ mov.f32 %f414, 0f3F6A937E;
352
+ mov.f32 %f413, 0f3F20D842;
353
+ mov.f32 %f420, %f95;
354
+ @%p20 bra $L__BB0_26;
355
+ mul.f32 %f420, %f15, %f15;
356
+ mov.f32 %f419, 0f38B1E96A;
357
+ mov.f32 %f418, 0fBA574D20;
358
+ mov.f32 %f417, 0f3BAAD5EA;
359
+ mov.f32 %f416, 0fBCDC1BE7;
360
+ mov.f32 %f415, 0f3DE718AF;
361
+ mov.f32 %f414, 0fBEC093AC;
362
+ mov.f32 %f413, 0f3E0375D3;
363
+ $L__BB0_26:
364
+ .loc 1 0 0
365
+ mul.f32 %f16, %f8, 0f3F3504F3;
366
+ .loc 1 30 23
367
+ setp.ltu.f32 %p21, %f95, 0f3F8060FE;
368
+ fma.rn.ftz.f32 %f285, %f419, %f420, %f418;
369
+ fma.rn.ftz.f32 %f286, %f285, %f420, %f417;
370
+ fma.rn.ftz.f32 %f287, %f286, %f420, %f416;
371
+ fma.rn.ftz.f32 %f288, %f287, %f420, %f415;
372
+ fma.rn.ftz.f32 %f289, %f288, %f420, %f414;
373
+ fma.rn.ftz.f32 %f290, %f289, %f420, %f413;
374
+ neg.f32 %f291, %f420;
375
+ selp.f32 %f292, %f291, %f15, %p20;
376
+ fma.rn.ftz.f32 %f421, %f290, %f292, %f292;
377
+ @%p21 bra $L__BB0_28;
378
+ ex2.approx.ftz.f32 %f293, %f421;
379
+ sub.f32 %f295, %f358, %f293;
380
+ mov.b32 %r43, %f295;
381
+ mov.b32 %r44, %f15;
382
+ and.b32 %r45, %r44, -2147483648;
383
+ or.b32 %r46, %r45, %r43;
384
+ mov.b32 %f421, %r46;
385
+ $L__BB0_28:
386
+ abs.ftz.f32 %f108, %f16;
387
+ setp.ge.f32 %p23, %f108, 0f3F8060FE;
388
+ mov.f32 %f428, 0f3789CA3C;
389
+ mov.f32 %f427, 0fB9F560B9;
390
+ mov.f32 %f426, 0f3BAC840B;
391
+ mov.f32 %f425, 0fBD0C8162;
392
+ mov.f32 %f424, 0f3E1CF906;
393
+ mov.f32 %f423, 0f3F6A937E;
394
+ mov.f32 %f422, 0f3F20D842;
395
+ mov.f32 %f429, %f108;
396
+ @%p23 bra $L__BB0_30;
397
+ mul.f32 %f429, %f16, %f16;
398
+ mov.f32 %f428, 0f38B1E96A;
399
+ mov.f32 %f427, 0fBA574D20;
400
+ mov.f32 %f426, 0f3BAAD5EA;
401
+ mov.f32 %f425, 0fBCDC1BE7;
402
+ mov.f32 %f424, 0f3DE718AF;
403
+ mov.f32 %f423, 0fBEC093AC;
404
+ mov.f32 %f422, 0f3E0375D3;
405
+ $L__BB0_30:
406
+ setp.ltu.f32 %p24, %f108, 0f3F8060FE;
407
+ fma.rn.ftz.f32 %f310, %f428, %f429, %f427;
408
+ fma.rn.ftz.f32 %f311, %f310, %f429, %f426;
409
+ fma.rn.ftz.f32 %f312, %f311, %f429, %f425;
410
+ fma.rn.ftz.f32 %f313, %f312, %f429, %f424;
411
+ fma.rn.ftz.f32 %f314, %f313, %f429, %f423;
412
+ fma.rn.ftz.f32 %f315, %f314, %f429, %f422;
413
+ neg.f32 %f316, %f429;
414
+ selp.f32 %f317, %f316, %f16, %p23;
415
+ fma.rn.ftz.f32 %f430, %f315, %f317, %f317;
416
+ @%p24 bra $L__BB0_32;
417
+ ex2.approx.ftz.f32 %f318, %f430;
418
+ sub.f32 %f320, %f358, %f318;
419
+ mov.b32 %r47, %f320;
420
+ mov.b32 %r48, %f16;
421
+ and.b32 %r49, %r48, -2147483648;
422
+ or.b32 %r50, %r49, %r47;
423
+ mov.b32 %f430, %r50;
424
+ $L__BB0_32:
425
+ .loc 1 27 18
426
+ mul.f32 %f321, %f8, 0f3F000000;
427
+ mul.f32 %f322, %f7, 0f3F000000;
428
+ mul.f32 %f323, %f6, 0f3F000000;
429
+ mul.f32 %f324, %f5, 0f3F000000;
430
+ mul.f32 %f325, %f4, 0f3F000000;
431
+ mul.f32 %f326, %f3, 0f3F000000;
432
+ mul.f32 %f327, %f2, 0f3F000000;
433
+ mul.f32 %f328, %f1, 0f3F000000;
434
+ .loc 1 32 18
435
+ add.f32 %f329, %f367, 0f3F800000;
436
+ add.f32 %f330, %f376, 0f3F800000;
437
+ add.f32 %f331, %f385, 0f3F800000;
438
+ add.f32 %f332, %f394, 0f3F800000;
439
+ add.f32 %f333, %f403, 0f3F800000;
440
+ add.f32 %f334, %f412, 0f3F800000;
441
+ add.f32 %f335, %f421, 0f3F800000;
442
+ add.f32 %f336, %f430, 0f3F800000;
443
+ .loc 1 33 18
444
+ mul.f32 %f337, %f328, %f329;
445
+ mul.f32 %f338, %f327, %f330;
446
+ mul.f32 %f339, %f326, %f331;
447
+ mul.f32 %f340, %f325, %f332;
448
+ mul.f32 %f341, %f324, %f333;
449
+ mul.f32 %f342, %f323, %f334;
450
+ mul.f32 %f343, %f322, %f335;
451
+ mul.f32 %f344, %f321, %f336;
452
+ .loc 1 35 40
453
+ mov.b32 %r51, %f337;
454
+ cvt.rn.bf16.f32 %rs9, %r51;
455
+ mov.b32 %r52, %f338;
456
+ cvt.rn.bf16.f32 %rs10, %r52;
457
+ mov.b32 %r53, %f339;
458
+ cvt.rn.bf16.f32 %rs11, %r53;
459
+ mov.b32 %r54, %f340;
460
+ cvt.rn.bf16.f32 %rs12, %r54;
461
+ mov.b32 %r55, %f341;
462
+ cvt.rn.bf16.f32 %rs13, %r55;
463
+ mov.b32 %r56, %f342;
464
+ cvt.rn.bf16.f32 %rs14, %r56;
465
+ mov.b32 %r57, %f343;
466
+ cvt.rn.bf16.f32 %rs15, %r57;
467
+ mov.b32 %r58, %f344;
468
+ cvt.rn.bf16.f32 %rs16, %r58;
469
+ mov.b32 %r63, {%rs9, %rs10};
470
+ mov.b32 %r64, {%rs11, %rs12};
471
+ mov.b32 %r65, {%rs13, %rs14};
472
+ mov.b32 %r66, {%rs15, %rs16};
473
+ @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r63, %r64, %r65, %r66 };
474
+ .loc 1 35 4
475
+ ret;
476
+ $L__tmp1:
477
+ $L__func_end0:
478
+
479
+ }
480
+ // .globl __nv_erff
481
+ .visible .func (.param .b32 func_retval0) __nv_erff(
482
+ .param .b32 __nv_erff_param_0
483
+ )
484
+ {
485
+ .reg .pred %p<4>;
486
+ .reg .b32 %r<5>;
487
+ .reg .f32 %f<49>;
488
+ $L__func_begin1:
489
+
490
+ ld.param.f32 %f14, [__nv_erff_param_0];
491
+ abs.ftz.f32 %f1, %f14;
492
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
493
+ mov.f32 %f46, 0f3789CA3C;
494
+ mov.f32 %f45, 0fB9F560B9;
495
+ mov.f32 %f44, 0f3BAC840B;
496
+ mov.f32 %f43, 0fBD0C8162;
497
+ mov.f32 %f42, 0f3E1CF906;
498
+ mov.f32 %f41, 0f3F6A937E;
499
+ mov.f32 %f40, 0f3F20D842;
500
+ mov.f32 %f47, %f1;
501
+ @%p1 bra $L__BB1_2;
502
+ mul.f32 %f47, %f14, %f14;
503
+ mov.f32 %f46, 0f38B1E96A;
504
+ mov.f32 %f45, 0fBA574D20;
505
+ mov.f32 %f44, 0f3BAAD5EA;
506
+ mov.f32 %f43, 0fBCDC1BE7;
507
+ mov.f32 %f42, 0f3DE718AF;
508
+ mov.f32 %f41, 0fBEC093AC;
509
+ mov.f32 %f40, 0f3E0375D3;
510
+ $L__BB1_2:
511
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
512
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
513
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
514
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
515
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
516
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
517
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
518
+ neg.f32 %f35, %f47;
519
+ selp.f32 %f36, %f35, %f14, %p1;
520
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
521
+ @%p2 bra $L__BB1_4;
522
+ ex2.approx.ftz.f32 %f37, %f48;
523
+ mov.f32 %f38, 0f3F800000;
524
+ sub.f32 %f39, %f38, %f37;
525
+ mov.b32 %r1, %f39;
526
+ mov.b32 %r2, %f14;
527
+ and.b32 %r3, %r2, -2147483648;
528
+ or.b32 %r4, %r3, %r1;
529
+ mov.b32 %f48, %r4;
530
+ $L__BB1_4:
531
+ st.param.f32 [func_retval0+0], %f48;
532
+ ret;
533
+ $L__func_end1:
534
+
535
+ }
536
+ .file 1 "/tmp/torchinductor_root/kp/ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py"
537
+ .section .debug_abbrev
538
+ {
539
+ .b8 1
540
+ .b8 17
541
+ .b8 1
542
+ .b8 37
543
+ .b8 8
544
+ .b8 19
545
+ .b8 5
546
+ .b8 3
547
+ .b8 8
548
+ .b8 16
549
+ .b8 6
550
+ .b8 27
551
+ .b8 8
552
+ .b8 180
553
+ .b8 66
554
+ .b8 12
555
+ .b8 17
556
+ .b8 1
557
+ .b8 18
558
+ .b8 1
559
+ .b8 0
560
+ .b8 0
561
+ .b8 2
562
+ .b8 46
563
+ .b8 0
564
+ .b8 17
565
+ .b8 1
566
+ .b8 18
567
+ .b8 1
568
+ .b8 64
569
+ .b8 10
570
+ .b8 135
571
+ .b8 64
572
+ .b8 8
573
+ .b8 3
574
+ .b8 8
575
+ .b8 58
576
+ .b8 11
577
+ .b8 59
578
+ .b8 11
579
+ .b8 63
580
+ .b8 12
581
+ .b8 0
582
+ .b8 0
583
+ .b8 0
584
+ }
585
+ .section .debug_info
586
+ {
587
+ .b32 172
588
+ .b8 2
589
+ .b8 0
590
+ .b32 .debug_abbrev
591
+ .b8 8
592
+ .b8 1
593
+ .b8 116
594
+ .b8 114
595
+ .b8 105
596
+ .b8 116
597
+ .b8 111
598
+ .b8 110
599
+ .b8 0
600
+ .b8 2
601
+ .b8 0
602
+ .b8 99
603
+ .b8 107
604
+ .b8 112
605
+ .b8 104
606
+ .b8 114
607
+ .b8 116
608
+ .b8 100
609
+ .b8 112
610
+ .b8 103
611
+ .b8 115
612
+ .b8 120
613
+ .b8 108
614
+ .b8 55
615
+ .b8 115
616
+ .b8 102
617
+ .b8 97
618
+ .b8 114
619
+ .b8 107
620
+ .b8 107
621
+ .b8 122
622
+ .b8 121
623
+ .b8 108
624
+ .b8 104
625
+ .b8 118
626
+ .b8 52
627
+ .b8 115
628
+ .b8 116
629
+ .b8 51
630
+ .b8 117
631
+ .b8 104
632
+ .b8 109
633
+ .b8 122
634
+ .b8 118
635
+ .b8 103
636
+ .b8 51
637
+ .b8 117
638
+ .b8 54
639
+ .b8 122
640
+ .b8 53
641
+ .b8 101
642
+ .b8 120
643
+ .b8 99
644
+ .b8 102
645
+ .b8 112
646
+ .b8 54
647
+ .b8 121
648
+ .b8 100
649
+ .b8 121
650
+ .b8 98
651
+ .b8 113
652
+ .b8 55
653
+ .b8 52
654
+ .b8 46
655
+ .b8 112
656
+ .b8 121
657
+ .b8 0
658
+ .b32 .debug_line
659
+ .b8 47
660
+ .b8 116
661
+ .b8 109
662
+ .b8 112
663
+ .b8 47
664
+ .b8 116
665
+ .b8 111
666
+ .b8 114
667
+ .b8 99
668
+ .b8 104
669
+ .b8 105
670
+ .b8 110
671
+ .b8 100
672
+ .b8 117
673
+ .b8 99
674
+ .b8 116
675
+ .b8 111
676
+ .b8 114
677
+ .b8 95
678
+ .b8 114
679
+ .b8 111
680
+ .b8 111
681
+ .b8 116
682
+ .b8 47
683
+ .b8 107
684
+ .b8 112
685
+ .b8 0
686
+ .b8 1
687
+ .b64 $L__func_begin0
688
+ .b64 $L__func_end0
689
+ .b8 2
690
+ .b64 $L__func_begin0
691
+ .b64 $L__func_end0
692
+ .b8 1
693
+ .b8 156
694
+ .b8 116
695
+ .b8 114
696
+ .b8 105
697
+ .b8 116
698
+ .b8 111
699
+ .b8 110
700
+ .b8 95
701
+ .b8 95
702
+ .b8 48
703
+ .b8 100
704
+ .b8 49
705
+ .b8 100
706
+ .b8 101
707
+ .b8 0
708
+ .b8 116
709
+ .b8 114
710
+ .b8 105
711
+ .b8 116
712
+ .b8 111
713
+ .b8 110
714
+ .b8 95
715
+ .b8 95
716
+ .b8 48
717
+ .b8 100
718
+ .b8 49
719
+ .b8 100
720
+ .b8 101
721
+ .b8 0
722
+ .b8 1
723
+ .b8 18
724
+ .b8 1
725
+ .b8 0
726
+ }
727
+ .section .debug_pubnames
728
+ {
729
+ .b32 $L__pubNames_end0-$L__pubNames_start0
730
+ $L__pubNames_start0:
731
+ .b8 2
732
+ .b8 0
733
+ .b32 .debug_info
734
+ .b32 176
735
+ .b32 125
736
+ .b8 116
737
+ .b8 114
738
+ .b8 105
739
+ .b8 116
740
+ .b8 111
741
+ .b8 110
742
+ .b8 95
743
+ .b8 95
744
+ .b8 48
745
+ .b8 100
746
+ .b8 49
747
+ .b8 100
748
+ .b8 101
749
+ .b8 0
750
+ .b32 0
751
+ $L__pubNames_end0:
752
+ }
753
+ .section .debug_pubtypes
754
+ {
755
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
756
+ $L__pubTypes_start0:
757
+ .b8 2
758
+ .b8 0
759
+ .b32 .debug_info
760
+ .b32 176
761
+ .b32 0
762
+ $L__pubTypes_end0:
763
+ }
764
+ .section .debug_loc { }
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
5
+ %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
7
+ %c1024_i32 = arith.constant 1024 : i32
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.muli %0, %c1024_i32 : i32
10
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
11
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
12
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
13
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
14
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
15
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
16
+ %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
17
+ %9 = arith.mulf %8, %cst_1 : tensor<1024xf32, #blocked>
18
+ %10 = arith.mulf %8, %cst_0 : tensor<1024xf32, #blocked>
19
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
20
+ %12 = arith.addf %11, %cst : tensor<1024xf32, #blocked>
21
+ %13 = arith.mulf %9, %12 : tensor<1024xf32, #blocked>
22
+ %14 = arith.truncf %13 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
23
+ tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
24
+ tt.return
25
+ }
26
+ }
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
6
+ %c1024_i32 = arith.constant 1024 : i32
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c1024_i32 : i32
9
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
10
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
11
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
12
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
13
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
14
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
15
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
16
+ %9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
17
+ %10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
18
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
19
+ %12 = arith.addf %11, %cst : tensor<1024xf32>
20
+ %13 = arith.mulf %9, %12 : tensor<1024xf32>
21
+ %14 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
22
+ tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
23
+ tt.return
24
+ }
25
+ }
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin ADDED
Binary file (14.6 kB). View file
 
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<32x1xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<32x1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<512> : tensor<32x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<32x1xi32, #blocked>
9
+ %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
10
+ %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
11
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1>
12
+ %cst_6 = arith.constant dense<true> : tensor<32x1xi1, #blocked>
13
+ %c32_i32 = arith.constant 32 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c32_i32 : i32
16
+ %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
17
+ %3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32x1xi32, #blocked1>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xi32, #blocked>
20
+ %6 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked1>
21
+ %7 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked>
22
+ %8 = arith.addi %6, %4 : tensor<32x1xi32, #blocked1>
23
+ %9 = arith.addi %7, %5 : tensor<32x1xi32, #blocked>
24
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
26
+ %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
27
+ %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
28
+ %14 = tt.broadcast %8 : (tensor<32x1xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
29
+ %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
30
+ %16 = arith.addi %14, %15 : tensor<32x128xi32, #blocked1>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>, #blocked1>
32
+ %18 = tt.addptr %17, %16 : tensor<32x128x!tt.ptr<f32, 1>, #blocked1>, tensor<32x128xi32, #blocked1>
33
+ %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<32x128xi1, #blocked1>
34
+ %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32, #blocked1>
35
+ %21 = arith.addf %20, %cst_5 : tensor<32x128xf32, #blocked1>
36
+ %22 = arith.select %19, %21, %cst_5 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1>
37
+ %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
38
+ ^bb0(%arg5: f32, %arg6: f32):
39
+ %40 = arith.addf %arg5, %arg6 : f32
40
+ tt.reduce.return %40 : f32
41
+ }) : (tensor<32x128xf32, #blocked1>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
42
+ %24 = triton_gpu.convert_layout %23 : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
43
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xf32, #blocked>
44
+ %26 = arith.divsi %9, %cst_2 : tensor<32x1xi32, #blocked>
45
+ %27 = arith.remsi %9, %cst_2 : tensor<32x1xi32, #blocked>
46
+ %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>, #blocked>
47
+ %29 = tt.addptr %28, %26 : tensor<32x1x!tt.ptr<i64, 1>, #blocked>, tensor<32x1xi32, #blocked>
48
+ %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64, #blocked>
49
+ %31 = arith.addi %30, %cst_1 : tensor<32x1xi64, #blocked>
50
+ %32 = arith.cmpi slt, %30, %cst_0 : tensor<32x1xi64, #blocked>
51
+ %33 = arith.select %32, %31, %30 : tensor<32x1xi1, #blocked>, tensor<32x1xi64, #blocked>
52
+ %34 = arith.muli %33, %cst : tensor<32x1xi64, #blocked>
53
+ %35 = arith.extsi %27 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked>
54
+ %36 = arith.addi %35, %34 : tensor<32x1xi64, #blocked>
55
+ %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>, #blocked>
56
+ %38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xi64, #blocked>
57
+ %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xf32, #blocked>, tensor<32x1xi1, #blocked>) -> tensor<32x1xf32, #blocked>
58
+ tt.return
59
+ }
60
+ }
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp7 < 50257"
7
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
8
+
9
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
10
+
11
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2) local_unnamed_addr !dbg !7 {
12
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
13
+ %5 = and i32 %4, 127, !dbg !10
14
+ %6 = shl nuw nsw i32 %5, 1, !dbg !10
15
+ %7 = or i32 %6, 1, !dbg !10
16
+ %8 = or i32 %6, 256, !dbg !10
17
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !11
18
+ %10 = sext i32 %9 to i64, !dbg !12
19
+ %11 = shl nsw i64 %10, 9, !dbg !13
20
+ %12 = zext nneg i32 %6 to i64
21
+ %13 = zext nneg i32 %8 to i64
22
+ %14 = or i64 %11, %12, !dbg !14
23
+ %15 = or i64 %11, %13, !dbg !14
24
+ %16 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !15
25
+ %17 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !15
26
+ %18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %16, i1 true) #2, !dbg !16
27
+ %19 = extractvalue { i64, i64 } %18, 0, !dbg !16
28
+ %20 = extractvalue { i64, i64 } %18, 1, !dbg !16
29
+ %21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %17, i1 true) #2, !dbg !16
30
+ %22 = extractvalue { i64, i64 } %21, 0, !dbg !16
31
+ %23 = extractvalue { i64, i64 } %21, 1, !dbg !16
32
+ %24 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !17
33
+ %25 = insertelement <4 x i64> %24, i64 %22, i64 1, !dbg !17
34
+ %26 = insertelement <4 x i64> %25, i64 %20, i64 2, !dbg !17
35
+ %27 = insertelement <4 x i64> %26, i64 %19, i64 3, !dbg !17
36
+ %28 = icmp eq <4 x i64> %27, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !17
37
+ %29 = select <4 x i1> %28, <4 x i64> zeroinitializer, <4 x i64> %27, !dbg !18
38
+ %30 = add <4 x i64> %29, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !19
39
+ %31 = icmp slt <4 x i64> %29, zeroinitializer, !dbg !20
40
+ %32 = select <4 x i1> %31, <4 x i64> %30, <4 x i64> %29, !dbg !21
41
+ %33 = icmp ult <4 x i64> %32, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
42
+ %34 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %12, !dbg !22
43
+ %35 = extractelement <4 x i1> %33, i64 3, !dbg !22
44
+ %36 = zext i1 %35 to i8, !dbg !22
45
+ %37 = insertelement <1 x i8> undef, i8 %36, i64 0, !dbg !22
46
+ store <1 x i8> %37, ptr addrspace(3) %34, align 1, !dbg !22
47
+ %38 = zext nneg i32 %7 to i64, !dbg !22
48
+ %39 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %38, !dbg !22
49
+ %40 = extractelement <4 x i1> %33, i64 2, !dbg !22
50
+ %41 = zext i1 %40 to i8, !dbg !22
51
+ %42 = insertelement <1 x i8> undef, i8 %41, i64 0, !dbg !22
52
+ store <1 x i8> %42, ptr addrspace(3) %39, align 1, !dbg !22
53
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
54
+ %43 = zext nneg i32 %5 to i64, !dbg !22
55
+ %44 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %43, !dbg !22
56
+ %45 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
57
+ %46 = or i32 %5, 128, !dbg !22
58
+ %47 = zext nneg i32 %46 to i64, !dbg !22
59
+ %48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %47, !dbg !22
60
+ %49 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
61
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
62
+ %50 = extractelement <4 x i1> %33, i64 1, !dbg !22
63
+ %51 = zext i1 %50 to i8, !dbg !22
64
+ %52 = insertelement <1 x i8> undef, i8 %51, i64 0, !dbg !22
65
+ store <1 x i8> %52, ptr addrspace(3) %34, align 1, !dbg !22
66
+ %53 = extractelement <4 x i1> %33, i64 0, !dbg !22
67
+ %54 = zext i1 %53 to i8, !dbg !22
68
+ %55 = insertelement <1 x i8> undef, i8 %54, i64 0, !dbg !22
69
+ store <1 x i8> %55, ptr addrspace(3) %39, align 1, !dbg !22
70
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
71
+ %56 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
72
+ %57 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
73
+ %58 = insertelement <4 x i8> poison, i8 %49, i64 0, !dbg !22
74
+ %59 = insertelement <4 x i8> %58, i8 %45, i64 1, !dbg !22
75
+ %60 = insertelement <4 x i8> %59, i8 %56, i64 2, !dbg !22
76
+ %61 = insertelement <4 x i8> %60, i8 %57, i64 3, !dbg !22
77
+ %62 = icmp eq <4 x i8> %61, zeroinitializer, !dbg !22
78
+ %63 = bitcast <4 x i1> %62 to i4, !dbg !23
79
+ %.not = icmp eq i4 %63, 0, !dbg !23
80
+ br i1 %.not, label %65, label %64, !dbg !23
81
+
82
+ 64: ; preds = %3
83
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
84
+ br label %65, !dbg !23
85
+
86
+ 65: ; preds = %64, %3
87
+ %66 = or i32 %6, 257, !dbg !10
88
+ %67 = zext nneg i32 %66 to i64
89
+ %68 = or i64 %11, %67, !dbg !14
90
+ %69 = or i64 %11, %38, !dbg !14
91
+ %70 = mul nsw i64 %14, 50257, !dbg !24
92
+ %71 = mul nsw i64 %69, 50257, !dbg !24
93
+ %72 = mul nsw i64 %15, 50257, !dbg !24
94
+ %73 = mul nsw i64 %68, 50257, !dbg !24
95
+ %74 = extractelement <4 x i64> %32, i64 3, !dbg !25
96
+ %75 = getelementptr float, ptr addrspace(1) %1, i64 %74, !dbg !25
97
+ %76 = getelementptr float, ptr addrspace(1) %75, i64 %70, !dbg !25
98
+ %77 = extractelement <4 x i64> %32, i64 2, !dbg !25
99
+ %78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !25
100
+ %79 = getelementptr float, ptr addrspace(1) %78, i64 %71, !dbg !25
101
+ %80 = extractelement <4 x i64> %32, i64 1, !dbg !25
102
+ %81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !25
103
+ %82 = getelementptr float, ptr addrspace(1) %81, i64 %72, !dbg !25
104
+ %83 = extractelement <4 x i64> %32, i64 0, !dbg !25
105
+ %84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !25
106
+ %85 = getelementptr float, ptr addrspace(1) %84, i64 %73, !dbg !25
107
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
108
+ %86 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %12, !dbg !26
109
+ %87 = ptrtoint ptr addrspace(1) %76 to i64, !dbg !26
110
+ %88 = insertelement <1 x i64> undef, i64 %87, i64 0, !dbg !26
111
+ store <1 x i64> %88, ptr addrspace(3) %86, align 8, !dbg !26
112
+ %89 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %38, !dbg !26
113
+ %90 = ptrtoint ptr addrspace(1) %79 to i64, !dbg !26
114
+ %91 = insertelement <1 x i64> undef, i64 %90, i64 0, !dbg !26
115
+ store <1 x i64> %91, ptr addrspace(3) %89, align 8, !dbg !26
116
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
117
+ %92 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %43, !dbg !26
118
+ %93 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
119
+ %94 = inttoptr i64 %93 to ptr addrspace(1), !dbg !26
120
+ %95 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %47, !dbg !26
121
+ %96 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
122
+ %97 = inttoptr i64 %96 to ptr addrspace(1), !dbg !26
123
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
124
+ %98 = ptrtoint ptr addrspace(1) %82 to i64, !dbg !26
125
+ %99 = insertelement <1 x i64> undef, i64 %98, i64 0, !dbg !26
126
+ store <1 x i64> %99, ptr addrspace(3) %86, align 8, !dbg !26
127
+ %100 = ptrtoint ptr addrspace(1) %85 to i64, !dbg !26
128
+ %101 = insertelement <1 x i64> undef, i64 %100, i64 0, !dbg !26
129
+ store <1 x i64> %101, ptr addrspace(3) %89, align 8, !dbg !26
130
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
131
+ %102 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
132
+ %103 = inttoptr i64 %102 to ptr addrspace(1), !dbg !26
133
+ %104 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
134
+ %105 = inttoptr i64 %104 to ptr addrspace(1), !dbg !26
135
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %94, i1 true) #2, !dbg !26
136
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %97, i1 true) #2, !dbg !26
137
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %103, i1 true) #2, !dbg !26
138
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %105, i1 true) #2, !dbg !26
139
+ ret void, !dbg !27
140
+ }
141
+
142
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
143
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
144
+
145
+ ; Function Attrs: convergent nocallback nounwind
146
+ declare void @llvm.nvvm.barrier0() #1
147
+
148
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
149
+ attributes #1 = { convergent nocallback nounwind }
150
+ attributes #2 = { nounwind }
151
+
152
+ !llvm.module.flags = !{!0, !1}
153
+ !llvm.dbg.cu = !{!2}
154
+ !nvvm.annotations = !{!4, !5, !5, !4}
155
+ !llvm.ident = !{!6}
156
+
157
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
158
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
159
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
160
+ !3 = !DIFile(filename: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py", directory: "/tmp/torchinductor_root/hl")
161
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
162
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
163
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
164
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
165
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
166
+ !9 = !{}
167
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
168
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
169
+ !12 = !DILocation(line: 20, column: 34, scope: !7)
170
+ !13 = !DILocation(line: 20, column: 46, scope: !7)
171
+ !14 = !DILocation(line: 21, column: 23, scope: !7)
172
+ !15 = !DILocation(line: 24, column: 30, scope: !7)
173
+ !16 = !DILocation(line: 24, column: 35, scope: !7)
174
+ !17 = !DILocation(line: 26, column: 19, scope: !7)
175
+ !18 = !DILocation(line: 28, column: 32, scope: !7)
176
+ !19 = !DILocation(line: 29, column: 18, scope: !7)
177
+ !20 = !DILocation(line: 30, column: 18, scope: !7)
178
+ !21 = !DILocation(line: 31, column: 32, scope: !7)
179
+ !22 = !DILocation(line: 32, column: 36, scope: !7)
180
+ !23 = !DILocation(line: 32, column: 51, scope: !7)
181
+ !24 = !DILocation(line: 34, column: 39, scope: !7)
182
+ !25 = !DILocation(line: 34, column: 25, scope: !7)
183
+ !26 = !DILocation(line: 34, column: 51, scope: !7)
184
+ !27 = !DILocation(line: 34, column: 4, scope: !7)
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin ADDED
Binary file (5.54 kB). View file
 
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c1024_i32 = arith.constant 1024 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c1024_i32 : i32
6
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
8
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32>
12
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
13
+ %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
14
+ %10 = arith.truncf %7 : tensor<1024xf32> to tensor<1024xbf16>
15
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin ADDED
Binary file (10.5 kB). View file
 
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 {
7
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %4 = shl i32 %3, 1, !dbg !10
9
+ %5 = and i32 %4, 510, !dbg !10
10
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %7 = shl i32 %6, 9, !dbg !12
12
+ %8 = or i32 %7, %5, !dbg !13
13
+ %9 = sext i32 %8 to i64, !dbg !14
14
+ %10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14
15
+ %11 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15
16
+ %12 = trunc i32 %11 to i16, !dbg !15
17
+ %extelt.offset = lshr i32 %11, 16, !dbg !15
18
+ %13 = trunc i32 %extelt.offset to i16, !dbg !15
19
+ %14 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %12) #4, !dbg !16
20
+ %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
21
+ %16 = fmul float %14, 0x3FE6A09E60000000, !dbg !17
22
+ %17 = fmul float %15, 0x3FE6A09E60000000, !dbg !17
23
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
24
+ %.not.i = icmp eq i32 %18, 0, !dbg !18
25
+ %19 = tail call float @llvm.nvvm.fabs.ftz.f(float %16) #4, !dbg !18
26
+ %20 = tail call float @llvm.nvvm.fabs.f(float %16) #4, !dbg !18
27
+ %.0.i = select i1 %.not.i, float %20, float %19, !dbg !18
28
+ %21 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
29
+ br i1 %21, label %__nv_fabsf.exit1.i, label %23, !dbg !18
30
+
31
+ __nv_fabsf.exit1.i: ; preds = %2
32
+ %22 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
33
+ %.not1.i = icmp eq i32 %22, 0, !dbg !18
34
+ %.01.i = select i1 %.not1.i, float %20, float %19, !dbg !18
35
+ br label %__internal_fmad.exit.i, !dbg !18
36
+
37
+ 23: ; preds = %2
38
+ %24 = fmul float %16, %16, !dbg !18
39
+ br label %__internal_fmad.exit.i, !dbg !18
40
+
41
+ __internal_fmad.exit.i: ; preds = %23, %__nv_fabsf.exit1.i
42
+ %25 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %23 ], !dbg !18
43
+ %26 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %23 ], !dbg !18
44
+ %27 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %23 ], !dbg !18
45
+ %28 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %23 ], !dbg !18
46
+ %29 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %23 ], !dbg !18
47
+ %30 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %23 ], !dbg !18
48
+ %31 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %23 ], !dbg !18
49
+ %32 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %24, %23 ], !dbg !18
50
+ %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
51
+ %.not2.i = icmp eq i32 %33, 0, !dbg !18
52
+ %34 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %31, float %32, float %30) #4, !dbg !18
53
+ %35 = tail call float @llvm.nvvm.fma.rn.f(float %31, float %32, float %30) #4, !dbg !18
54
+ %.02.i = select i1 %.not2.i, float %35, float %34, !dbg !18
55
+ %36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
56
+ %.not3.i = icmp eq i32 %36, 0, !dbg !18
57
+ %37 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %32, float %29) #4, !dbg !18
58
+ %38 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %32, float %29) #4, !dbg !18
59
+ %.03.i = select i1 %.not3.i, float %38, float %37, !dbg !18
60
+ %39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
61
+ %.not4.i = icmp eq i32 %39, 0, !dbg !18
62
+ %40 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %32, float %28) #4, !dbg !18
63
+ %41 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %32, float %28) #4, !dbg !18
64
+ %.04.i = select i1 %.not4.i, float %41, float %40, !dbg !18
65
+ %42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
66
+ %.not5.i = icmp eq i32 %42, 0, !dbg !18
67
+ %43 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %32, float %27) #4, !dbg !18
68
+ %44 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %32, float %27) #4, !dbg !18
69
+ %.05.i = select i1 %.not5.i, float %44, float %43, !dbg !18
70
+ %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
71
+ %.not6.i = icmp eq i32 %45, 0, !dbg !18
72
+ %46 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %32, float %26) #4, !dbg !18
73
+ %47 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %32, float %26) #4, !dbg !18
74
+ %.06.i = select i1 %.not6.i, float %47, float %46, !dbg !18
75
+ %48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
76
+ %.not7.i = icmp eq i32 %48, 0, !dbg !18
77
+ %49 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %32, float %25) #4, !dbg !18
78
+ %50 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %32, float %25) #4, !dbg !18
79
+ %.07.i = select i1 %.not7.i, float %50, float %49, !dbg !18
80
+ %51 = fneg float %32, !dbg !18
81
+ %52 = select i1 %21, float %51, float %16, !dbg !18
82
+ %53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
83
+ %.not8.i = icmp eq i32 %53, 0, !dbg !18
84
+ %54 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %52, float %52) #4, !dbg !18
85
+ %55 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %52, float %52) #4, !dbg !18
86
+ %.08.i = select i1 %.not8.i, float %55, float %54, !dbg !18
87
+ br i1 %21, label %56, label %__nv_erff.exit, !dbg !18
88
+
89
+ 56: ; preds = %__internal_fmad.exit.i
90
+ %57 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
91
+ %58 = fsub float 1.000000e+00, %57, !dbg !18
92
+ %59 = bitcast float %58 to i32, !dbg !18
93
+ %60 = bitcast float %16 to i32, !dbg !18
94
+ %61 = and i32 %60, -2147483648, !dbg !18
95
+ %62 = or i32 %61, %59, !dbg !18
96
+ %63 = bitcast i32 %62 to float, !dbg !18
97
+ br label %__nv_erff.exit, !dbg !18
98
+
99
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %56
100
+ %r.0.i = phi float [ %63, %56 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
101
+ %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
102
+ %.not.i1 = icmp eq i32 %64, 0, !dbg !18
103
+ %65 = tail call float @llvm.nvvm.fabs.ftz.f(float %17) #4, !dbg !18
104
+ %66 = tail call float @llvm.nvvm.fabs.f(float %17) #4, !dbg !18
105
+ %.0.i2 = select i1 %.not.i1, float %66, float %65, !dbg !18
106
+ %67 = fcmp oge float %.0.i2, 0x3FF00C1FC0000000, !dbg !18
107
+ br i1 %67, label %__nv_fabsf.exit1.i19, label %69, !dbg !18
108
+
109
+ __nv_fabsf.exit1.i19: ; preds = %__nv_erff.exit
110
+ %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
111
+ %.not1.i20 = icmp eq i32 %68, 0, !dbg !18
112
+ %.01.i21 = select i1 %.not1.i20, float %66, float %65, !dbg !18
113
+ br label %__internal_fmad.exit.i3, !dbg !18
114
+
115
+ 69: ; preds = %__nv_erff.exit
116
+ %70 = fmul float %17, %17, !dbg !18
117
+ br label %__internal_fmad.exit.i3, !dbg !18
118
+
119
+ __internal_fmad.exit.i3: ; preds = %69, %__nv_fabsf.exit1.i19
120
+ %71 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i19 ], [ 0x3FC06EBA60000000, %69 ], !dbg !18
121
+ %72 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i19 ], [ 0xBFD8127580000000, %69 ], !dbg !18
122
+ %73 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i19 ], [ 0x3FBCE315E0000000, %69 ], !dbg !18
123
+ %74 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i19 ], [ 0xBF9B837CE0000000, %69 ], !dbg !18
124
+ %75 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i19 ], [ 0x3F755ABD40000000, %69 ], !dbg !18
125
+ %76 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i19 ], [ 0xBF4AE9A400000000, %69 ], !dbg !18
126
+ %77 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i19 ], [ 0x3F163D2D40000000, %69 ], !dbg !18
127
+ %78 = phi float [ %.01.i21, %__nv_fabsf.exit1.i19 ], [ %70, %69 ], !dbg !18
128
+ %79 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
129
+ %.not2.i4 = icmp eq i32 %79, 0, !dbg !18
130
+ %80 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float %78, float %76) #4, !dbg !18
131
+ %81 = tail call float @llvm.nvvm.fma.rn.f(float %77, float %78, float %76) #4, !dbg !18
132
+ %.02.i5 = select i1 %.not2.i4, float %81, float %80, !dbg !18
133
+ %82 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
134
+ %.not3.i6 = icmp eq i32 %82, 0, !dbg !18
135
+ %83 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i5, float %78, float %75) #4, !dbg !18
136
+ %84 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i5, float %78, float %75) #4, !dbg !18
137
+ %.03.i7 = select i1 %.not3.i6, float %84, float %83, !dbg !18
138
+ %85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
139
+ %.not4.i8 = icmp eq i32 %85, 0, !dbg !18
140
+ %86 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i7, float %78, float %74) #4, !dbg !18
141
+ %87 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i7, float %78, float %74) #4, !dbg !18
142
+ %.04.i9 = select i1 %.not4.i8, float %87, float %86, !dbg !18
143
+ %88 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
144
+ %.not5.i10 = icmp eq i32 %88, 0, !dbg !18
145
+ %89 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i9, float %78, float %73) #4, !dbg !18
146
+ %90 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i9, float %78, float %73) #4, !dbg !18
147
+ %.05.i11 = select i1 %.not5.i10, float %90, float %89, !dbg !18
148
+ %91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
149
+ %.not6.i12 = icmp eq i32 %91, 0, !dbg !18
150
+ %92 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i11, float %78, float %72) #4, !dbg !18
151
+ %93 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i11, float %78, float %72) #4, !dbg !18
152
+ %.06.i13 = select i1 %.not6.i12, float %93, float %92, !dbg !18
153
+ %94 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
154
+ %.not7.i14 = icmp eq i32 %94, 0, !dbg !18
155
+ %95 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i13, float %78, float %71) #4, !dbg !18
156
+ %96 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i13, float %78, float %71) #4, !dbg !18
157
+ %.07.i15 = select i1 %.not7.i14, float %96, float %95, !dbg !18
158
+ %97 = fneg float %78, !dbg !18
159
+ %98 = select i1 %67, float %97, float %17, !dbg !18
160
+ %99 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
161
+ %.not8.i16 = icmp eq i32 %99, 0, !dbg !18
162
+ %100 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i15, float %98, float %98) #4, !dbg !18
163
+ %101 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i15, float %98, float %98) #4, !dbg !18
164
+ %.08.i17 = select i1 %.not8.i16, float %101, float %100, !dbg !18
165
+ br i1 %67, label %102, label %__nv_erff.exit22, !dbg !18
166
+
167
+ 102: ; preds = %__internal_fmad.exit.i3
168
+ %103 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i17) #4, !dbg !18
169
+ %104 = fsub float 1.000000e+00, %103, !dbg !18
170
+ %105 = bitcast float %104 to i32, !dbg !18
171
+ %106 = bitcast float %17 to i32, !dbg !18
172
+ %107 = and i32 %106, -2147483648, !dbg !18
173
+ %108 = or i32 %107, %105, !dbg !18
174
+ %109 = bitcast i32 %108 to float, !dbg !18
175
+ br label %__nv_erff.exit22, !dbg !18
176
+
177
+ __nv_erff.exit22: ; preds = %__internal_fmad.exit.i3, %102
178
+ %r.0.i18 = phi float [ %109, %102 ], [ %.08.i17, %__internal_fmad.exit.i3 ], !dbg !18
179
+ %110 = fmul float %15, 5.000000e-01, !dbg !19
180
+ %111 = fmul float %14, 5.000000e-01, !dbg !19
181
+ %112 = fadd float %r.0.i, 1.000000e+00, !dbg !20
182
+ %113 = fadd float %r.0.i18, 1.000000e+00, !dbg !20
183
+ %114 = fmul float %111, %112, !dbg !21
184
+ %115 = fmul float %110, %113, !dbg !21
185
+ %116 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %114) #4, !dbg !22
186
+ %117 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %115) #4, !dbg !22
187
+ %118 = insertelement <2 x i16> undef, i16 %116, i64 0, !dbg !22
188
+ %119 = insertelement <2 x i16> %118, i16 %117, i64 1, !dbg !22
189
+ %120 = bitcast <2 x i16> %119 to i32, !dbg !22
190
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %120, ptr addrspace(1) %10, i1 true) #4, !dbg !22
191
+ ret void, !dbg !23
192
+ }
193
+
194
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
195
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
196
+
197
+ ; Function Attrs: alwaysinline nounwind
198
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
199
+ __nv_fabsf.exit:
200
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
201
+ %.not = icmp eq i32 %0, 0
202
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
203
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
204
+ %.0 = select i1 %.not, float %2, float %1
205
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
206
+ br i1 %3, label %__nv_fabsf.exit1, label %5
207
+
208
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
209
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
210
+ %.not1 = icmp eq i32 %4, 0
211
+ %.01 = select i1 %.not1, float %2, float %1
212
+ br label %__internal_fmad.exit
213
+
214
+ 5: ; preds = %__nv_fabsf.exit
215
+ %6 = fmul float %a, %a
216
+ br label %__internal_fmad.exit
217
+
218
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
219
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
220
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
221
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
222
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
223
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
224
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
225
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
226
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
227
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
228
+ %.not2 = icmp eq i32 %15, 0
229
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
230
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
231
+ %.02 = select i1 %.not2, float %17, float %16
232
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
233
+ %.not3 = icmp eq i32 %18, 0
234
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
235
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
236
+ %.03 = select i1 %.not3, float %20, float %19
237
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
238
+ %.not4 = icmp eq i32 %21, 0
239
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
240
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
241
+ %.04 = select i1 %.not4, float %23, float %22
242
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
243
+ %.not5 = icmp eq i32 %24, 0
244
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
245
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
246
+ %.05 = select i1 %.not5, float %26, float %25
247
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
248
+ %.not6 = icmp eq i32 %27, 0
249
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
250
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
251
+ %.06 = select i1 %.not6, float %29, float %28
252
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
253
+ %.not7 = icmp eq i32 %30, 0
254
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
255
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
256
+ %.07 = select i1 %.not7, float %32, float %31
257
+ %33 = fneg float %14
258
+ %34 = select i1 %3, float %33, float %a
259
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
260
+ %.not8 = icmp eq i32 %35, 0
261
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
262
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
263
+ %.08 = select i1 %.not8, float %37, float %36
264
+ br i1 %3, label %38, label %46
265
+
266
+ 38: ; preds = %__internal_fmad.exit
267
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
268
+ %40 = fsub float 1.000000e+00, %39
269
+ %41 = bitcast float %40 to i32
270
+ %42 = bitcast float %a to i32
271
+ %43 = and i32 %42, -2147483648
272
+ %44 = or i32 %43, %41
273
+ %45 = bitcast i32 %44 to float
274
+ br label %46
275
+
276
+ 46: ; preds = %38, %__internal_fmad.exit
277
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
278
+ ret float %r.0
279
+ }
280
+
281
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
282
+
283
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
284
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
285
+
286
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
287
+ declare float @llvm.nvvm.fabs.f(float) #0
288
+
289
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
290
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
291
+
292
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
293
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
294
+
295
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
296
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
297
+
298
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
299
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
300
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
301
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
302
+ attributes #4 = { nounwind }
303
+
304
+ !llvm.module.flags = !{!0, !1}
305
+ !llvm.dbg.cu = !{!2}
306
+ !nvvm.annotations = !{!4, !5, !5, !4}
307
+ !llvm.ident = !{!6}
308
+
309
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
310
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
311
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
312
+ !3 = !DIFile(filename: "ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py", directory: "/tmp/torchinductor_root/kp")
313
+ !4 = !{ptr @triton__0d1de, !"kernel", i32 1}
314
+ !5 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
315
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
316
+ !7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
317
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
318
+ !9 = !{}
319
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
320
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
321
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
322
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
323
+ !14 = !DILocation(line: 24, column: 34, scope: !7)
324
+ !15 = !DILocation(line: 24, column: 39, scope: !7)
325
+ !16 = !DILocation(line: 24, column: 48, scope: !7)
326
+ !17 = !DILocation(line: 29, column: 18, scope: !7)
327
+ !18 = !DILocation(line: 30, column: 23, scope: !7)
328
+ !19 = !DILocation(line: 27, column: 18, scope: !7)
329
+ !20 = !DILocation(line: 32, column: 18, scope: !7)
330
+ !21 = !DILocation(line: 33, column: 18, scope: !7)
331
+ !22 = !DILocation(line: 35, column: 40, scope: !7)
332
+ !23 = !DILocation(line: 35, column: 4, scope: !7)
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<512xf32>
4
+ %cst_0 = arith.constant dense<0.707106769> : tensor<512xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
6
+ %c512_i32 = arith.constant 512 : i32
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c512_i32 : i32
9
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
10
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
11
+ %4 = arith.addi %3, %2 : tensor<512xi32>
12
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
13
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
14
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
15
+ %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
16
+ %9 = arith.mulf %8, %cst_1 : tensor<512xf32>
17
+ %10 = arith.mulf %8, %cst_0 : tensor<512xf32>
18
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32>
19
+ %12 = arith.addf %11, %cst : tensor<512xf32>
20
+ %13 = arith.mulf %9, %12 : tensor<512xf32>
21
+ %14 = arith.truncf %13 : tensor<512xf32> to tensor<512xbf16>
22
+ tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
23
+ tt.return
24
+ }
25
+ }
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u64 triton__0d1de_param_1
14
+ )
15
+ .maxntid 256, 1, 1
16
+ {
17
+ .reg .pred %p<2>;
18
+ .reg .b32 %r<7>;
19
+ .reg .b64 %rd<7>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd2, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r4, %tid.x;
28
+ shl.b32 %r5, %r4, 1;
29
+ and.b32 %r6, %r5, 510;
30
+ .loc 1 20 28
31
+ mov.u32 %r1, %ctaid.x;
32
+ .loc 1 20 46
33
+ mul.wide.s32 %rd3, %r1, 512;
34
+ cvt.u64.u32 %rd4, %r6;
35
+ .loc 1 21 23
36
+ or.b64 %rd5, %rd3, %rd4;
37
+ .loc 1 25 25
38
+ shl.b64 %rd6, %rd5, 2;
39
+ add.s64 %rd1, %rd2, %rd6;
40
+ mov.b32 %r2, 0;
41
+ mov.pred %p1, -1;
42
+ .loc 1 25 36
43
+ @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
44
+ .loc 1 25 4
45
+ ret;
46
+ $L__tmp1:
47
+ $L__func_end0:
48
+
49
+ }
50
+ .file 1 "/tmp/torchinductor_root/pk/cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py"
51
+ .section .debug_abbrev
52
+ {
53
+ .b8 1
54
+ .b8 17
55
+ .b8 1
56
+ .b8 37
57
+ .b8 8
58
+ .b8 19
59
+ .b8 5
60
+ .b8 3
61
+ .b8 8
62
+ .b8 16
63
+ .b8 6
64
+ .b8 27
65
+ .b8 8
66
+ .b8 180
67
+ .b8 66
68
+ .b8 12
69
+ .b8 17
70
+ .b8 1
71
+ .b8 18
72
+ .b8 1
73
+ .b8 0
74
+ .b8 0
75
+ .b8 2
76
+ .b8 46
77
+ .b8 0
78
+ .b8 17
79
+ .b8 1
80
+ .b8 18
81
+ .b8 1
82
+ .b8 64
83
+ .b8 10
84
+ .b8 135
85
+ .b8 64
86
+ .b8 8
87
+ .b8 3
88
+ .b8 8
89
+ .b8 58
90
+ .b8 11
91
+ .b8 59
92
+ .b8 11
93
+ .b8 63
94
+ .b8 12
95
+ .b8 0
96
+ .b8 0
97
+ .b8 0
98
+ }
99
+ .section .debug_info
100
+ {
101
+ .b32 172
102
+ .b8 2
103
+ .b8 0
104
+ .b32 .debug_abbrev
105
+ .b8 8
106
+ .b8 1
107
+ .b8 116
108
+ .b8 114
109
+ .b8 105
110
+ .b8 116
111
+ .b8 111
112
+ .b8 110
113
+ .b8 0
114
+ .b8 2
115
+ .b8 0
116
+ .b8 99
117
+ .b8 112
118
+ .b8 107
119
+ .b8 119
120
+ .b8 51
121
+ .b8 98
122
+ .b8 100
123
+ .b8 111
124
+ .b8 97
125
+ .b8 109
126
+ .b8 108
127
+ .b8 103
128
+ .b8 122
129
+ .b8 118
130
+ .b8 113
131
+ .b8 106
132
+ .b8 101
133
+ .b8 121
134
+ .b8 117
135
+ .b8 107
136
+ .b8 51
137
+ .b8 52
138
+ .b8 98
139
+ .b8 51
140
+ .b8 106
141
+ .b8 99
142
+ .b8 106
143
+ .b8 102
144
+ .b8 53
145
+ .b8 55
146
+ .b8 104
147
+ .b8 116
148
+ .b8 105
149
+ .b8 115
150
+ .b8 97
151
+ .b8 114
152
+ .b8 97
153
+ .b8 55
154
+ .b8 108
155
+ .b8 117
156
+ .b8 107
157
+ .b8 102
158
+ .b8 108
159
+ .b8 101
160
+ .b8 120
161
+ .b8 111
162
+ .b8 51
163
+ .b8 116
164
+ .b8 50
165
+ .b8 50
166
+ .b8 101
167
+ .b8 119
168
+ .b8 46
169
+ .b8 112
170
+ .b8 121
171
+ .b8 0
172
+ .b32 .debug_line
173
+ .b8 47
174
+ .b8 116
175
+ .b8 109
176
+ .b8 112
177
+ .b8 47
178
+ .b8 116
179
+ .b8 111
180
+ .b8 114
181
+ .b8 99
182
+ .b8 104
183
+ .b8 105
184
+ .b8 110
185
+ .b8 100
186
+ .b8 117
187
+ .b8 99
188
+ .b8 116
189
+ .b8 111
190
+ .b8 114
191
+ .b8 95
192
+ .b8 114
193
+ .b8 111
194
+ .b8 111
195
+ .b8 116
196
+ .b8 47
197
+ .b8 112
198
+ .b8 107
199
+ .b8 0
200
+ .b8 1
201
+ .b64 $L__func_begin0
202
+ .b64 $L__func_end0
203
+ .b8 2
204
+ .b64 $L__func_begin0
205
+ .b64 $L__func_end0
206
+ .b8 1
207
+ .b8 156
208
+ .b8 116
209
+ .b8 114
210
+ .b8 105
211
+ .b8 116
212
+ .b8 111
213
+ .b8 110
214
+ .b8 95
215
+ .b8 95
216
+ .b8 48
217
+ .b8 100
218
+ .b8 49
219
+ .b8 100
220
+ .b8 101
221
+ .b8 0
222
+ .b8 116
223
+ .b8 114
224
+ .b8 105
225
+ .b8 116
226
+ .b8 111
227
+ .b8 110
228
+ .b8 95
229
+ .b8 95
230
+ .b8 48
231
+ .b8 100
232
+ .b8 49
233
+ .b8 100
234
+ .b8 101
235
+ .b8 0
236
+ .b8 1
237
+ .b8 18
238
+ .b8 1
239
+ .b8 0
240
+ }
241
+ .section .debug_pubnames
242
+ {
243
+ .b32 $L__pubNames_end0-$L__pubNames_start0
244
+ $L__pubNames_start0:
245
+ .b8 2
246
+ .b8 0
247
+ .b32 .debug_info
248
+ .b32 176
249
+ .b32 125
250
+ .b8 116
251
+ .b8 114
252
+ .b8 105
253
+ .b8 116
254
+ .b8 111
255
+ .b8 110
256
+ .b8 95
257
+ .b8 95
258
+ .b8 48
259
+ .b8 100
260
+ .b8 49
261
+ .b8 100
262
+ .b8 101
263
+ .b8 0
264
+ .b32 0
265
+ $L__pubNames_end0:
266
+ }
267
+ .section .debug_pubtypes
268
+ {
269
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
270
+ $L__pubTypes_start0:
271
+ .b8 2
272
+ .b8 0
273
+ .b32 .debug_info
274
+ .b32 176
275
+ .b32 0
276
+ $L__pubTypes_end0:
277
+ }
278
+ .section .debug_loc { }
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin ADDED
Binary file (7.07 kB). View file
 
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %5 = and i32 %4, 127, !dbg !8
9
+ %6 = shl nuw nsw i32 %5, 3, !dbg !8
10
+ %7 = shl nuw nsw i32 %5, 2, !dbg !8
11
+ %8 = or i32 %7, 512, !dbg !8
12
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
13
+ %10 = shl i32 %9, 10, !dbg !10
14
+ %11 = or i32 %10, %6, !dbg !11
15
+ %12 = or i32 %10, %7, !dbg !11
16
+ %13 = or i32 %10, %8, !dbg !11
17
+ %14 = sext i32 %11 to i64, !dbg !12
18
+ %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
19
+ %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
20
+ %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
21
+ %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
22
+ %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
23
+ %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
24
+ %21 = trunc i32 %17 to i16, !dbg !13
25
+ %extelt.offset = lshr i32 %17, 16, !dbg !13
26
+ %22 = trunc i32 %extelt.offset to i16, !dbg !13
27
+ %23 = trunc i32 %18 to i16, !dbg !13
28
+ %extelt.offset1 = lshr i32 %18, 16, !dbg !13
29
+ %24 = trunc i32 %extelt.offset1 to i16, !dbg !13
30
+ %25 = trunc i32 %19 to i16, !dbg !13
31
+ %extelt.offset2 = lshr i32 %19, 16, !dbg !13
32
+ %26 = trunc i32 %extelt.offset2 to i16, !dbg !13
33
+ %27 = trunc i32 %20 to i16, !dbg !13
34
+ %extelt.offset3 = lshr i32 %20, 16, !dbg !13
35
+ %28 = trunc i32 %extelt.offset3 to i16, !dbg !13
36
+ %29 = zext nneg i32 %6 to i64, !dbg !14
37
+ %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
38
+ %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
39
+ store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
40
+ %32 = or i32 %6, 1, !dbg !14
41
+ %33 = zext nneg i32 %32 to i64, !dbg !14
42
+ %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
43
+ %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
44
+ store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
45
+ %36 = or i32 %6, 2, !dbg !14
46
+ %37 = zext nneg i32 %36 to i64, !dbg !14
47
+ %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
48
+ %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
49
+ store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
50
+ %40 = or i32 %6, 3, !dbg !14
51
+ %41 = zext nneg i32 %40 to i64, !dbg !14
52
+ %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
53
+ %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
54
+ store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
55
+ %44 = or i32 %6, 4, !dbg !14
56
+ %45 = zext nneg i32 %44 to i64, !dbg !14
57
+ %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
58
+ %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
59
+ store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
60
+ %48 = or i32 %6, 5, !dbg !14
61
+ %49 = zext nneg i32 %48 to i64, !dbg !14
62
+ %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
63
+ %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
64
+ store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
65
+ %52 = or i32 %6, 6, !dbg !14
66
+ %53 = zext nneg i32 %52 to i64, !dbg !14
67
+ %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
68
+ %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
69
+ store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
70
+ %56 = or i32 %6, 7, !dbg !14
71
+ %57 = zext nneg i32 %56 to i64, !dbg !14
72
+ %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
73
+ %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
74
+ store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
75
+ tail call void @llvm.nvvm.barrier0(), !dbg !14
76
+ %60 = zext nneg i32 %7 to i64, !dbg !14
77
+ %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
78
+ %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
79
+ %63 = or i32 %7, 1, !dbg !14
80
+ %64 = zext nneg i32 %63 to i64, !dbg !14
81
+ %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
82
+ %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
83
+ %67 = or i32 %7, 2, !dbg !14
84
+ %68 = zext nneg i32 %67 to i64, !dbg !14
85
+ %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
86
+ %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
87
+ %71 = or i32 %7, 3, !dbg !14
88
+ %72 = zext nneg i32 %71 to i64, !dbg !14
89
+ %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
90
+ %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
91
+ %75 = zext nneg i32 %8 to i64, !dbg !14
92
+ %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
93
+ %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
94
+ %78 = or i32 %7, 513, !dbg !14
95
+ %79 = zext nneg i32 %78 to i64, !dbg !14
96
+ %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
97
+ %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
98
+ %82 = or i32 %7, 514, !dbg !14
99
+ %83 = zext nneg i32 %82 to i64, !dbg !14
100
+ %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
101
+ %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
102
+ %86 = or i32 %7, 515, !dbg !14
103
+ %87 = zext nneg i32 %86 to i64, !dbg !14
104
+ %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
105
+ %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
106
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
107
+ %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
108
+ %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
109
+ %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
110
+ %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
111
+ %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
112
+ %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
113
+ %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
114
+ %98 = sext i32 %12 to i64, !dbg !15
115
+ %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
116
+ %100 = sext i32 %13 to i64, !dbg !15
117
+ %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
118
+ %102 = bitcast float %90 to i32, !dbg !16
119
+ %103 = bitcast float %91 to i32, !dbg !16
120
+ %104 = bitcast float %92 to i32, !dbg !16
121
+ %105 = bitcast float %93 to i32, !dbg !16
122
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
123
+ %106 = bitcast float %94 to i32, !dbg !16
124
+ %107 = bitcast float %95 to i32, !dbg !16
125
+ %108 = bitcast float %96 to i32, !dbg !16
126
+ %109 = bitcast float %97 to i32, !dbg !16
127
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
128
+ ret void, !dbg !17
129
+ }
130
+
131
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
132
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
133
+
134
+ ; Function Attrs: convergent nocallback nounwind
135
+ declare void @llvm.nvvm.barrier0() #1
136
+
137
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
138
+ attributes #1 = { convergent nocallback nounwind }
139
+ attributes #2 = { nounwind }
140
+
141
+ !llvm.module.flags = !{!0}
142
+ !llvm.dbg.cu = !{!1}
143
+ !nvvm.annotations = !{!3, !4, !4, !3}
144
+
145
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
146
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
147
+ !2 = !DIFile(filename: "ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py", directory: "/tmp/torchinductor_root/k6")
148
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
149
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
150
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
151
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
152
+ !7 = !{}
153
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
154
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
155
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
156
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
157
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
158
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
159
+ !14 = !DILocation(line: 24, column: 44, scope: !5)
160
+ !15 = !DILocation(line: 26, column: 25, scope: !5)
161
+ !16 = !DILocation(line: 26, column: 36, scope: !5)
162
+ !17 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 128, 1, 1
18
+ {
19
+ .reg .pred %p<4>;
20
+ .reg .b16 %rs<9>;
21
+ .reg .b32 %r<37>;
22
+ .reg .b64 %rd<13>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
28
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
29
+ $L__tmp0:
30
+ .loc 1 21 36
31
+ mov.u32 %r22, %tid.x;
32
+ and.b32 %r23, %r22, 127;
33
+ shl.b32 %r24, %r23, 3;
34
+ shl.b32 %r25, %r23, 2;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r26, %r1, 10;
39
+ .loc 1 21 23
40
+ or.b32 %r27, %r26, %r24;
41
+ or.b32 %r28, %r26, %r25;
42
+ .loc 1 24 30
43
+ mul.wide.s32 %rd6, %r27, 2;
44
+ add.s64 %rd1, %rd4, %rd6;
45
+ mov.pred %p1, -1;
46
+ .loc 1 24 35
47
+ mov.u32 %r2, 0x0;
48
+ mov.u32 %r3, 0x0;
49
+ mov.u32 %r4, 0x0;
50
+ mov.u32 %r5, 0x0;
51
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
52
+ shr.u32 %r29, %r2, 16;
53
+ shr.u32 %r30, %r3, 16;
54
+ shr.u32 %r31, %r4, 16;
55
+ shr.u32 %r32, %r5, 16;
56
+ .loc 1 24 44
57
+ shl.b32 %r33, %r23, 4;
58
+ mov.u32 %r34, global_smem;
59
+ add.s32 %r35, %r34, %r33;
60
+ st.shared.u16 [%r35], %r2;
61
+ st.shared.u16 [%r35+2], %r29;
62
+ st.shared.u16 [%r35+4], %r3;
63
+ st.shared.u16 [%r35+6], %r30;
64
+ st.shared.u16 [%r35+8], %r4;
65
+ st.shared.u16 [%r35+10], %r31;
66
+ st.shared.u16 [%r35+12], %r5;
67
+ st.shared.u16 [%r35+14], %r32;
68
+ bar.sync 0;
69
+ add.s32 %r36, %r34, %r24;
70
+ ld.shared.u16 %rs1, [%r36];
71
+ ld.shared.u16 %rs2, [%r36+2];
72
+ ld.shared.u16 %rs3, [%r36+4];
73
+ ld.shared.u16 %rs4, [%r36+6];
74
+ ld.shared.u16 %rs5, [%r36+1024];
75
+ ld.shared.u16 %rs6, [%r36+1026];
76
+ ld.shared.u16 %rs7, [%r36+1028];
77
+ ld.shared.u16 %rs8, [%r36+1030];
78
+ cvt.f32.bf16 %r14, %rs1;
79
+ cvt.f32.bf16 %r15, %rs2;
80
+ cvt.f32.bf16 %r16, %rs3;
81
+ cvt.f32.bf16 %r17, %rs4;
82
+ cvt.f32.bf16 %r18, %rs5;
83
+ cvt.f32.bf16 %r19, %rs6;
84
+ cvt.f32.bf16 %r20, %rs7;
85
+ cvt.f32.bf16 %r21, %rs8;
86
+ .loc 1 26 25
87
+ mul.wide.s32 %rd7, %r28, 4;
88
+ add.s64 %rd2, %rd5, %rd7;
89
+ cvt.s64.s32 %rd8, %r26;
90
+ cvt.u64.u32 %rd9, %r25;
91
+ or.b64 %rd10, %rd8, %rd9;
92
+ shl.b64 %rd11, %rd10, 2;
93
+ add.s64 %rd12, %rd5, %rd11;
94
+ add.s64 %rd3, %rd12, 2048;
95
+ .loc 1 26 36
96
+ @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
97
+ @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
98
+ .loc 1 26 4
99
+ ret;
100
+ $L__tmp1:
101
+ $L__func_end0:
102
+
103
+ }
104
+ .file 1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
105
+ .section .debug_abbrev
106
+ {
107
+ .b8 1
108
+ .b8 17
109
+ .b8 1
110
+ .b8 37
111
+ .b8 8
112
+ .b8 19
113
+ .b8 5
114
+ .b8 3
115
+ .b8 8
116
+ .b8 16
117
+ .b8 6
118
+ .b8 27
119
+ .b8 8
120
+ .b8 180
121
+ .b8 66
122
+ .b8 12
123
+ .b8 17
124
+ .b8 1
125
+ .b8 18
126
+ .b8 1
127
+ .b8 0
128
+ .b8 0
129
+ .b8 2
130
+ .b8 46
131
+ .b8 0
132
+ .b8 17
133
+ .b8 1
134
+ .b8 18
135
+ .b8 1
136
+ .b8 64
137
+ .b8 10
138
+ .b8 135
139
+ .b8 64
140
+ .b8 8
141
+ .b8 3
142
+ .b8 8
143
+ .b8 58
144
+ .b8 11
145
+ .b8 59
146
+ .b8 11
147
+ .b8 63
148
+ .b8 12
149
+ .b8 0
150
+ .b8 0
151
+ .b8 0
152
+ }
153
+ .section .debug_info
154
+ {
155
+ .b32 176
156
+ .b8 2
157
+ .b8 0
158
+ .b32 .debug_abbrev
159
+ .b8 8
160
+ .b8 1
161
+ .b8 116
162
+ .b8 114
163
+ .b8 105
164
+ .b8 116
165
+ .b8 111
166
+ .b8 110
167
+ .b8 0
168
+ .b8 2
169
+ .b8 0
170
+ .b8 99
171
+ .b8 107
172
+ .b8 54
173
+ .b8 50
174
+ .b8 107
175
+ .b8 50
176
+ .b8 120
177
+ .b8 122
178
+ .b8 98
179
+ .b8 98
180
+ .b8 54
181
+ .b8 53
182
+ .b8 55
183
+ .b8 115
184
+ .b8 110
185
+ .b8 102
186
+ .b8 100
187
+ .b8 111
188
+ .b8 119
189
+ .b8 119
190
+ .b8 97
191
+ .b8 110
192
+ .b8 122
193
+ .b8 115
194
+ .b8 122
195
+ .b8 97
196
+ .b8 105
197
+ .b8 106
198
+ .b8 54
199
+ .b8 113
200
+ .b8 122
201
+ .b8 119
202
+ .b8 54
203
+ .b8 118
204
+ .b8 117
205
+ .b8 99
206
+ .b8 55
207
+ .b8 99
208
+ .b8 102
209
+ .b8 105
210
+ .b8 100
211
+ .b8 111
212
+ .b8 109
213
+ .b8 106
214
+ .b8 112
215
+ .b8 107
216
+ .b8 107
217
+ .b8 54
218
+ .b8 105
219
+ .b8 103
220
+ .b8 99
221
+ .b8 109
222
+ .b8 46
223
+ .b8 112
224
+ .b8 121
225
+ .b8 0
226
+ .b32 .debug_line
227
+ .b8 47
228
+ .b8 116
229
+ .b8 109
230
+ .b8 112
231
+ .b8 47
232
+ .b8 116
233
+ .b8 111
234
+ .b8 114
235
+ .b8 99
236
+ .b8 104
237
+ .b8 105
238
+ .b8 110
239
+ .b8 100
240
+ .b8 117
241
+ .b8 99
242
+ .b8 116
243
+ .b8 111
244
+ .b8 114
245
+ .b8 95
246
+ .b8 114
247
+ .b8 111
248
+ .b8 111
249
+ .b8 116
250
+ .b8 47
251
+ .b8 107
252
+ .b8 54
253
+ .b8 0
254
+ .b8 1
255
+ .b64 $L__func_begin0
256
+ .b64 $L__func_end0
257
+ .b8 2
258
+ .b64 $L__func_begin0
259
+ .b64 $L__func_end0
260
+ .b8 1
261
+ .b8 156
262
+ .b8 116
263
+ .b8 114
264
+ .b8 105
265
+ .b8 116
266
+ .b8 111
267
+ .b8 110
268
+ .b8 95
269
+ .b8 95
270
+ .b8 48
271
+ .b8 100
272
+ .b8 49
273
+ .b8 100
274
+ .b8 50
275
+ .b8 100
276
+ .b8 101
277
+ .b8 0
278
+ .b8 116
279
+ .b8 114
280
+ .b8 105
281
+ .b8 116
282
+ .b8 111
283
+ .b8 110
284
+ .b8 95
285
+ .b8 95
286
+ .b8 48
287
+ .b8 100
288
+ .b8 49
289
+ .b8 100
290
+ .b8 50
291
+ .b8 100
292
+ .b8 101
293
+ .b8 0
294
+ .b8 1
295
+ .b8 18
296
+ .b8 1
297
+ .b8 0
298
+ }
299
+ .section .debug_pubnames
300
+ {
301
+ .b32 $L__pubNames_end0-$L__pubNames_start0
302
+ $L__pubNames_start0:
303
+ .b8 2
304
+ .b8 0
305
+ .b32 .debug_info
306
+ .b32 180
307
+ .b32 125
308
+ .b8 116
309
+ .b8 114
310
+ .b8 105
311
+ .b8 116
312
+ .b8 111
313
+ .b8 110
314
+ .b8 95
315
+ .b8 95
316
+ .b8 48
317
+ .b8 100
318
+ .b8 49
319
+ .b8 100
320
+ .b8 50
321
+ .b8 100
322
+ .b8 101
323
+ .b8 0
324
+ .b32 0
325
+ $L__pubNames_end0:
326
+ }
327
+ .section .debug_pubtypes
328
+ {
329
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
330
+ $L__pubTypes_start0:
331
+ .b8 2
332
+ .b8 0
333
+ .b32 .debug_info
334
+ .b32 180
335
+ .b32 0
336
+ $L__pubTypes_end0:
337
+ }
338
+ .section .debug_loc { }
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
10
+ %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
11
+ %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
12
+ %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
13
+ %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
14
+ %8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
15
+ %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
16
+ %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
17
+ %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
18
+ %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
19
+ %13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
20
+ %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
21
+ tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
22
+ tt.return
23
+ }
24
+ }
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c1024_i32 = arith.constant 1024 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c1024_i32 : i32
6
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
8
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
12
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
13
+ %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
14
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
15
+ tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin ADDED
Binary file (10.3 kB). View file
 
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 63, !dbg !8
9
+ %8 = lshr i32 %6, 6, !dbg !9
10
+ %9 = and i32 %8, 3, !dbg !9
11
+ %10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
12
+ %11 = shl i32 %10, 6, !dbg !11
13
+ %12 = or i32 %11, %7, !dbg !12
14
+ br label %13, !dbg !13
15
+
16
+ 13: ; preds = %5, %13
17
+ %14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ]
18
+ %15 = phi i32 [ 0, %5 ], [ %24, %13 ]
19
+ %16 = or i32 %15, %9, !dbg !14
20
+ %17 = shl i32 %16, 17, !dbg !15
21
+ %18 = add i32 %17, %12, !dbg !16
22
+ %19 = sext i32 %18 to i64, !dbg !17
23
+ %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17
24
+ %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18
25
+ %22 = bitcast i32 %21 to float, !dbg !18
26
+ %23 = fadd float %14, %22, !dbg !19
27
+ %24 = add nuw nsw i32 %15, 4, !dbg !13
28
+ %25 = icmp ult i32 %15, 116, !dbg !13
29
+ br i1 %25, label %13, label %26, !dbg !13
30
+
31
+ 26: ; preds = %13
32
+ %27 = shl nuw nsw i32 %7, 2, !dbg !20
33
+ %28 = or i32 %27, %9, !dbg !20
34
+ %29 = zext nneg i32 %28 to i64, !dbg !20
35
+ %30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20
36
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20
37
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
38
+ %31 = icmp slt i32 %6, 256, !dbg !20
39
+ %32 = sext i32 %6 to i64, !dbg !20
40
+ %33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20
41
+ %34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20
42
+ %35 = bitcast float %34 to i32, !dbg !20
43
+ %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20
44
+ %37 = bitcast i32 %36 to float, !dbg !20
45
+ %38 = fadd float %34, %37, !dbg !24
46
+ %39 = bitcast float %38 to i32, !dbg !20
47
+ %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20
48
+ %41 = bitcast i32 %40 to float, !dbg !20
49
+ %42 = fadd float %38, %41, !dbg !24
50
+ %43 = and i32 %6, 3, !dbg !20
51
+ %44 = icmp eq i32 %43, 0, !dbg !20
52
+ %45 = and i1 %31, %44, !dbg !20
53
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20
54
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
55
+ %46 = zext nneg i32 %27 to i64, !dbg !20
56
+ %47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20
57
+ %48 = load float, ptr addrspace(3) %47, align 4, !dbg !20
58
+ %.frozen = freeze i32 %12
59
+ %49 = sdiv i32 %.frozen, 256, !dbg !28
60
+ %50 = mul i32 %49, 256
61
+ %.decomposed = sub i32 %.frozen, %50
62
+ %51 = sext i32 %49 to i64, !dbg !29
63
+ %52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29
64
+ %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30
65
+ %54 = lshr i64 %53, 54, !dbg !31
66
+ %55 = and i64 %54, 512, !dbg !31
67
+ %56 = add i64 %55, %53, !dbg !31
68
+ %57 = shl i64 %56, 8, !dbg !32
69
+ %58 = sext i32 %.decomposed to i64, !dbg !33
70
+ %59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34
71
+ %60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34
72
+ %61 = icmp eq i32 %9, 0, !dbg !35
73
+ %62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35
74
+ %63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35
75
+ ret void, !dbg !36
76
+ }
77
+
78
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
79
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
80
+
81
+ ; Function Attrs: convergent nocallback nounwind
82
+ declare void @llvm.nvvm.barrier0() #1
83
+
84
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
85
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
86
+
87
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
88
+ attributes #1 = { convergent nocallback nounwind }
89
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
90
+ attributes #3 = { nounwind }
91
+
92
+ !llvm.module.flags = !{!0}
93
+ !llvm.dbg.cu = !{!1}
94
+ !nvvm.annotations = !{!3, !4, !4, !3}
95
+
96
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
97
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
98
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
99
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
100
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
101
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
102
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
103
+ !7 = !{}
104
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
105
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
106
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
107
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
108
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
109
+ !13 = !DILocation(line: 27, column: 36, scope: !5)
110
+ !14 = !DILocation(line: 28, column: 27, scope: !5)
111
+ !15 = !DILocation(line: 31, column: 47, scope: !5)
112
+ !16 = !DILocation(line: 31, column: 40, scope: !5)
113
+ !17 = !DILocation(line: 31, column: 34, scope: !5)
114
+ !18 = !DILocation(line: 31, column: 53, scope: !5)
115
+ !19 = !DILocation(line: 34, column: 38, scope: !5)
116
+ !20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
117
+ !21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
118
+ !22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
119
+ !23 = !DILocation(line: 35, column: 25, scope: !21)
120
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
121
+ !25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
122
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
123
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
124
+ !28 = !DILocation(line: 36, column: 20, scope: !5)
125
+ !29 = !DILocation(line: 38, column: 30, scope: !5)
126
+ !30 = !DILocation(line: 38, column: 35, scope: !5)
127
+ !31 = !DILocation(line: 41, column: 32, scope: !5)
128
+ !32 = !DILocation(line: 45, column: 40, scope: !5)
129
+ !33 = !DILocation(line: 45, column: 36, scope: !5)
130
+ !34 = !DILocation(line: 45, column: 30, scope: !5)
131
+ !35 = !DILocation(line: 45, column: 55, scope: !5)
132
+ !36 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked>
5
+ %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked>
7
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
8
+ %cst_3 = arith.constant dense<131072> : tensor<1x4xi32, #blocked>
9
+ %cst_4 = arith.constant dense<120> : tensor<1x4xi32, #blocked>
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c120_i32 = arith.constant 120 : i32
12
+ %c4_i32 = arith.constant 4 : i32
13
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
14
+ %cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
15
+ %c64_i32 = arith.constant 64 : i32
16
+ %0 = tt.get_program_id x : i32
17
+ %1 = arith.muli %0, %c64_i32 : i32
18
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
19
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
20
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
21
+ %5 = arith.addi %4, %3 : tensor<64x1xi32, #blocked>
22
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
24
+ %8 = tt.broadcast %5 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
25
+ %9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
26
+ %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x4xf32, #blocked>) : i32 {
27
+ %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32, #blocked>
28
+ %28 = arith.addi %27, %7 : tensor<1x4xi32, #blocked>
29
+ %29 = arith.cmpi slt, %28, %cst_4 : tensor<1x4xi32, #blocked>
30
+ %30 = arith.muli %28, %cst_3 : tensor<1x4xi32, #blocked>
31
+ %31 = tt.broadcast %30 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
32
+ %32 = arith.addi %8, %31 : tensor<64x4xi32, #blocked>
33
+ %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
34
+ %34 = tt.broadcast %29 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
35
+ %35 = tt.load %33, %34, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
36
+ %36 = arith.addf %arg6, %35 : tensor<64x4xf32, #blocked>
37
+ %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
38
+ scf.yield %37 : tensor<64x4xf32, #blocked>
39
+ }
40
+ %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
41
+ ^bb0(%arg5: f32, %arg6: f32):
42
+ %27 = arith.addf %arg5, %arg6 : f32
43
+ tt.reduce.return %27 : f32
44
+ }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
45
+ %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
46
+ %13 = arith.divsi %5, %cst_2 : tensor<64x1xi32, #blocked>
47
+ %14 = arith.remsi %5, %cst_2 : tensor<64x1xi32, #blocked>
48
+ %15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
49
+ %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
50
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
51
+ %18 = arith.addi %17, %cst_1 : tensor<64x1xi64, #blocked>
52
+ %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64, #blocked>
53
+ %20 = arith.select %19, %18, %17 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
54
+ %21 = arith.muli %20, %cst : tensor<64x1xi64, #blocked>
55
+ %22 = arith.extsi %14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
56
+ %23 = arith.addi %22, %21 : tensor<64x1xi64, #blocked>
57
+ %24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked>
58
+ %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xi64, #blocked>
59
+ %26 = "tt.atomic_rmw"(%25, %12, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked>
60
+ tt.return
61
+ }
62
+ }
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin ADDED
Binary file (24.1 kB). View file
 
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
27
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
28
+ %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
29
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
31
+ %20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
32
+ %21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
33
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
34
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
35
+ ^bb0(%arg7: f32, %arg8: f32):
36
+ %40 = arith.addf %arg7, %arg8 : f32
37
+ tt.reduce.return %40 : f32
38
+ }) : (tensor<256xf32, #blocked>) -> f32
39
+ %24 = arith.addf %23, %cst_2 : f32
40
+ %25 = arith.divf %24, %cst_1 : f32
41
+ %26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
42
+ %27 = arith.subf %21, %26 : tensor<256xf32, #blocked>
43
+ %28 = arith.mulf %27, %27 : tensor<256xf32, #blocked>
44
+ %29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
45
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg7: f32, %arg8: f32):
47
+ %40 = arith.addf %arg7, %arg8 : f32
48
+ tt.reduce.return %40 : f32
49
+ }) : (tensor<256xf32, #blocked>) -> f32
50
+ %31 = arith.addf %30, %cst_2 : f32
51
+ %32 = arith.divf %31, %cst_1 : f32
52
+ %33 = arith.addf %32, %cst_0 : f32
53
+ %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
54
+ %35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked>
55
+ %36 = arith.mulf %27, %35 : tensor<256xf32, #blocked>
56
+ %37 = arith.mulf %36, %19 : tensor<256xf32, #blocked>
57
+ %38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
58
+ %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
59
+ tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
60
+ tt.return
61
+ }
62
+ }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin ADDED
Binary file (7.46 kB). View file
 
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d34e
10
+
11
+ .visible .entry triton__0d1d2d34e(
12
+ .param .u64 triton__0d1d2d34e_param_0,
13
+ .param .u64 triton__0d1d2d34e_param_1,
14
+ .param .u64 triton__0d1d2d34e_param_2,
15
+ .param .u32 triton__0d1d2d34e_param_3,
16
+ .param .u32 triton__0d1d2d34e_param_4
17
+ )
18
+ .maxntid 64, 1, 1
19
+ {
20
+ .reg .pred %p<6>;
21
+ .reg .b32 %r<27>;
22
+ .reg .f32 %f<9>;
23
+ .reg .b64 %rd<24>;
24
+ .loc 1 18 0
25
+ $L__func_begin0:
26
+ .loc 1 18 0
27
+
28
+ ld.param.u64 %rd4, [triton__0d1d2d34e_param_0];
29
+ ld.param.u64 %rd5, [triton__0d1d2d34e_param_1];
30
+ $L__tmp0:
31
+ .loc 1 25 34
32
+ mov.u32 %r7, %tid.x;
33
+ and.b32 %r8, %r7, 7;
34
+ ld.param.u64 %rd6, [triton__0d1d2d34e_param_2];
35
+ .loc 1 28 30
36
+ mul.wide.u32 %rd7, %r8, 4;
37
+ add.s64 %rd1, %rd5, %rd7;
38
+ mov.b32 %r2, 0;
39
+ mov.pred %p1, -1;
40
+ .loc 1 28 35
41
+ mov.u32 %r1, 0x0;
42
+ @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
43
+ @!%p1 mov.u32 %r1, %r2;
44
+ mov.b32 %f1, %r1;
45
+ .loc 1 29 30
46
+ mul.wide.u32 %rd8, %r8, 8;
47
+ add.s64 %rd3, %rd6, %rd8;
48
+ .loc 1 29 35
49
+ mov.u64 %rd2, 0x0;
50
+ @%p1 ld.global.b64 { %rd2 }, [ %rd3 + 0 ];
51
+ @!%p1 mov.u64 %rd2, 0x0;
52
+ $L__tmp1:
53
+ .loc 2 243 36
54
+ shfl.sync.bfly.b32 %r9, %r1, 4, 31, -1;
55
+ mov.b32 %f2, %r9;
56
+ $L__tmp2:
57
+ .loc 2 233 15
58
+ add.f32 %f3, %f1, %f2;
59
+ $L__tmp3:
60
+ .loc 2 243 36
61
+ mov.b32 %r10, %f3;
62
+ shfl.sync.bfly.b32 %r11, %r10, 2, 31, -1;
63
+ mov.b32 %f4, %r11;
64
+ $L__tmp4:
65
+ .loc 2 233 15
66
+ add.f32 %f5, %f3, %f4;
67
+ $L__tmp5:
68
+ .loc 2 243 36
69
+ mov.b32 %r12, %f5;
70
+ shfl.sync.bfly.b32 %r13, %r12, 1, 31, -1;
71
+ mov.b32 %f6, %r13;
72
+ $L__tmp6:
73
+ .loc 2 233 15
74
+ add.f32 %f7, %f5, %f6;
75
+ $L__tmp7:
76
+ .loc 2 243 36
77
+ cvt.u32.u64 %r14, %rd2;
78
+ shfl.sync.bfly.b32 %r15, %r14, 4, 31, -1;
79
+ { .reg .b32 tmp; mov.b64 {tmp, %r16}, %rd2; }
80
+ shfl.sync.bfly.b32 %r17, %r16, 4, 31, -1;
81
+ cvt.u64.u32 %rd9, %r15;
82
+ cvt.u64.u32 %rd10, %r17;
83
+ shl.b64 %rd11, %rd10, 32;
84
+ or.b64 %rd12, %rd9, %rd11;
85
+ $L__tmp8:
86
+ .loc 2 233 15
87
+ add.s64 %rd13, %rd2, %rd12;
88
+ $L__tmp9:
89
+ .loc 2 243 36
90
+ cvt.u32.u64 %r18, %rd13;
91
+ shfl.sync.bfly.b32 %r19, %r18, 2, 31, -1;
92
+ { .reg .b32 tmp; mov.b64 {tmp, %r20}, %rd13; }
93
+ shfl.sync.bfly.b32 %r21, %r20, 2, 31, -1;
94
+ cvt.u64.u32 %rd14, %r19;
95
+ cvt.u64.u32 %rd15, %r21;
96
+ shl.b64 %rd16, %rd15, 32;
97
+ or.b64 %rd17, %rd14, %rd16;
98
+ $L__tmp10:
99
+ .loc 2 233 15
100
+ add.s64 %rd18, %rd13, %rd17;
101
+ $L__tmp11:
102
+ .loc 2 243 36
103
+ cvt.u32.u64 %r22, %rd18;
104
+ shfl.sync.bfly.b32 %r23, %r22, 1, 31, -1;
105
+ { .reg .b32 tmp; mov.b64 {tmp, %r24}, %rd18; }
106
+ shfl.sync.bfly.b32 %r25, %r24, 1, 31, -1;
107
+ cvt.u64.u32 %rd19, %r23;
108
+ cvt.u64.u32 %rd20, %r25;
109
+ shl.b64 %rd21, %rd20, 32;
110
+ or.b64 %rd22, %rd19, %rd21;
111
+ $L__tmp12:
112
+ .loc 2 233 15
113
+ add.s64 %rd23, %rd18, %rd22;
114
+ $L__tmp13:
115
+ .loc 1 36 20
116
+ cvt.rn.f32.s64 %f8, %rd23;
117
+ .loc 1 37 19
118
+ mov.b32 %r4, %f7;
119
+ mov.b32 %r5, %f8;
120
+ div.full.f32 %r6, %r4, %r5;
121
+ .loc 1 38 4
122
+ bar.sync 0;
123
+ .loc 1 39 71
124
+ and.b32 %r26, %r7, 63;
125
+ setp.eq.s32 %p5, %r26, 0;
126
+ @%p5 st.global.b32 [ %rd4 + 0 ], { %r6 };
127
+ .loc 1 39 4
128
+ ret;
129
+ $L__tmp14:
130
+ $L__func_end0:
131
+
132
+ }
133
+ .file 1 "/tmp/torchinductor_root/7z/c7zrzealf5bsn7qskl6y72zb73mh5bzf6uskuswp33lv4y5kk64w.py"
134
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
135
+ .section .debug_abbrev
136
+ {
137
+ .b8 1
138
+ .b8 17
139
+ .b8 1
140
+ .b8 37
141
+ .b8 8
142
+ .b8 19
143
+ .b8 5
144
+ .b8 3
145
+ .b8 8
146
+ .b8 16
147
+ .b8 6
148
+ .b8 27
149
+ .b8 8
150
+ .b8 180
151
+ .b8 66
152
+ .b8 12
153
+ .b8 17
154
+ .b8 1
155
+ .b8 18
156
+ .b8 1
157
+ .b8 0
158
+ .b8 0
159
+ .b8 2
160
+ .b8 46
161
+ .b8 0
162
+ .b8 135
163
+ .b8 64
164
+ .b8 8
165
+ .b8 3
166
+ .b8 8
167
+ .b8 58
168
+ .b8 11
169
+ .b8 59
170
+ .b8 11
171
+ .b8 63
172
+ .b8 12
173
+ .b8 32
174
+ .b8 11
175
+ .b8 0
176
+ .b8 0
177
+ .b8 3
178
+ .b8 46
179
+ .b8 1
180
+ .b8 17
181
+ .b8 1
182
+ .b8 18
183
+ .b8 1
184
+ .b8 64
185
+ .b8 10
186
+ .b8 49
187
+ .b8 19
188
+ .b8 0
189
+ .b8 0
190
+ .b8 4
191
+ .b8 29
192
+ .b8 0
193
+ .b8 49
194
+ .b8 19
195
+ .b8 17
196
+ .b8 1
197
+ .b8 18
198
+ .b8 1
199
+ .b8 88
200
+ .b8 11
201
+ .b8 89
202
+ .b8 11
203
+ .b8 87
204
+ .b8 11
205
+ .b8 0
206
+ .b8 0
207
+ .b8 5
208
+ .b8 29
209
+ .b8 1
210
+ .b8 49
211
+ .b8 19
212
+ .b8 17
213
+ .b8 1
214
+ .b8 18
215
+ .b8 1
216
+ .b8 88
217
+ .b8 11
218
+ .b8 89
219
+ .b8 11
220
+ .b8 87
221
+ .b8 11
222
+ .b8 0
223
+ .b8 0
224
+ .b8 0
225
+ }
226
+ .section .debug_info
227
+ {
228
+ .b32 333
229
+ .b8 2
230
+ .b8 0
231
+ .b32 .debug_abbrev
232
+ .b8 8
233
+ .b8 1
234
+ .b8 116
235
+ .b8 114
236
+ .b8 105
237
+ .b8 116
238
+ .b8 111
239
+ .b8 110
240
+ .b8 0
241
+ .b8 2
242
+ .b8 0
243
+ .b8 99
244
+ .b8 55
245
+ .b8 122
246
+ .b8 114
247
+ .b8 122
248
+ .b8 101
249
+ .b8 97
250
+ .b8 108
251
+ .b8 102
252
+ .b8 53
253
+ .b8 98
254
+ .b8 115
255
+ .b8 110
256
+ .b8 55
257
+ .b8 113
258
+ .b8 115
259
+ .b8 107
260
+ .b8 108
261
+ .b8 54
262
+ .b8 121
263
+ .b8 55
264
+ .b8 50
265
+ .b8 122
266
+ .b8 98
267
+ .b8 55
268
+ .b8 51
269
+ .b8 109
270
+ .b8 104
271
+ .b8 53
272
+ .b8 98
273
+ .b8 122
274
+ .b8 102
275
+ .b8 54
276
+ .b8 117
277
+ .b8 115
278
+ .b8 107
279
+ .b8 117
280
+ .b8 115
281
+ .b8 119
282
+ .b8 112
283
+ .b8 51
284
+ .b8 51
285
+ .b8 108
286
+ .b8 118
287
+ .b8 52
288
+ .b8 121
289
+ .b8 53
290
+ .b8 107
291
+ .b8 107
292
+ .b8 54
293
+ .b8 52
294
+ .b8 119
295
+ .b8 46
296
+ .b8 112
297
+ .b8 121
298
+ .b8 0
299
+ .b32 .debug_line
300
+ .b8 47
301
+ .b8 116
302
+ .b8 109
303
+ .b8 112
304
+ .b8 47
305
+ .b8 116
306
+ .b8 111
307
+ .b8 114
308
+ .b8 99
309
+ .b8 104
310
+ .b8 105
311
+ .b8 110
312
+ .b8 100
313
+ .b8 117
314
+ .b8 99
315
+ .b8 116
316
+ .b8 111
317
+ .b8 114
318
+ .b8 95
319
+ .b8 114
320
+ .b8 111
321
+ .b8 111
322
+ .b8 116
323
+ .b8 47
324
+ .b8 55
325
+ .b8 122
326
+ .b8 0
327
+ .b8 1
328
+ .b64 $L__func_begin0
329
+ .b64 $L__func_end0
330
+ .b8 2
331
+ .b8 116
332
+ .b8 114
333
+ .b8 105
334
+ .b8 116
335
+ .b8 111
336
+ .b8 110
337
+ .b8 95
338
+ .b8 95
339
+ .b8 48
340
+ .b8 100
341
+ .b8 49
342
+ .b8 100
343
+ .b8 50
344
+ .b8 100
345
+ .b8 51
346
+ .b8 52
347
+ .b8 101
348
+ .b8 0
349
+ .b8 116
350
+ .b8 114
351
+ .b8 105
352
+ .b8 116
353
+ .b8 111
354
+ .b8 110
355
+ .b8 95
356
+ .b8 95
357
+ .b8 48
358
+ .b8 100
359
+ .b8 49
360
+ .b8 100
361
+ .b8 50
362
+ .b8 100
363
+ .b8 51
364
+ .b8 52
365
+ .b8 101
366
+ .b8 0
367
+ .b8 1
368
+ .b8 18
369
+ .b8 1
370
+ .b8 1
371
+ .b8 3
372
+ .b64 $L__func_begin0
373
+ .b64 $L__func_end0
374
+ .b8 1
375
+ .b8 156
376
+ .b32 125
377
+ .b8 4
378
+ .b32 125
379
+ .b64 $L__tmp1
380
+ .b64 $L__tmp6
381
+ .b8 2
382
+ .b8 32
383
+ .b8 24
384
+ .b8 5
385
+ .b32 125
386
+ .b64 $L__tmp2
387
+ .b64 $L__tmp7
388
+ .b8 2
389
+ .b8 32
390
+ .b8 24
391
+ .b8 4
392
+ .b32 125
393
+ .b64 $L__tmp2
394
+ .b64 $L__tmp7
395
+ .b8 2
396
+ .b8 243
397
+ .b8 36
398
+ .b8 0
399
+ .b8 4
400
+ .b32 125
401
+ .b64 $L__tmp7
402
+ .b64 $L__tmp12
403
+ .b8 2
404
+ .b8 35
405
+ .b8 24
406
+ .b8 5
407
+ .b32 125
408
+ .b64 $L__tmp8
409
+ .b64 $L__tmp13
410
+ .b8 2
411
+ .b8 35
412
+ .b8 24
413
+ .b8 4
414
+ .b32 125
415
+ .b64 $L__tmp8
416
+ .b64 $L__tmp13
417
+ .b8 2
418
+ .b8 243
419
+ .b8 36
420
+ .b8 0
421
+ .b8 0
422
+ .b8 0
423
+ }
424
+ .section .debug_pubnames
425
+ {
426
+ .b32 $L__pubNames_end0-$L__pubNames_start0
427
+ $L__pubNames_start0:
428
+ .b8 2
429
+ .b8 0
430
+ .b32 .debug_info
431
+ .b32 337
432
+ .b32 125
433
+ .b8 116
434
+ .b8 114
435
+ .b8 105
436
+ .b8 116
437
+ .b8 111
438
+ .b8 110
439
+ .b8 95
440
+ .b8 95
441
+ .b8 48
442
+ .b8 100
443
+ .b8 49
444
+ .b8 100
445
+ .b8 50
446
+ .b8 100
447
+ .b8 51
448
+ .b8 52
449
+ .b8 101
450
+ .b8 0
451
+ .b32 0
452
+ $L__pubNames_end0:
453
+ }
454
+ .section .debug_pubtypes
455
+ {
456
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
457
+ $L__pubTypes_start0:
458
+ .b8 2
459
+ .b8 0
460
+ .b32 .debug_info
461
+ .b32 337
462
+ .b32 0
463
+ $L__pubTypes_end0:
464
+ }
465
+ .section .debug_loc { }
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin ADDED
Binary file (29.9 kB). View file
 
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
7
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %11 = lshr i32 %10, 5, !dbg !8
9
+ %urem = and i32 %10, 255, !dbg !8
10
+ %12 = or i32 %urem, 256, !dbg !8
11
+ %13 = or i32 %urem, 512, !dbg !8
12
+ %14 = or i32 %urem, 768, !dbg !8
13
+ %15 = or i32 %urem, 1024, !dbg !8
14
+ %16 = or i32 %urem, 1280, !dbg !8
15
+ %17 = or i32 %urem, 1536, !dbg !8
16
+ %18 = or i32 %urem, 1792, !dbg !8
17
+ %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
18
+ %20 = sext i32 %19 to i64, !dbg !10
19
+ %21 = insertelement <8 x i32> poison, i32 %urem, i64 0
20
+ %22 = insertelement <8 x i32> %21, i32 %12, i64 1
21
+ %23 = insertelement <8 x i32> %22, i32 %13, i64 2
22
+ %24 = insertelement <8 x i32> %23, i32 %14, i64 3
23
+ %25 = insertelement <8 x i32> %24, i32 %15, i64 4
24
+ %26 = insertelement <8 x i32> %25, i32 %16, i64 5
25
+ %27 = insertelement <8 x i32> %26, i32 %17, i64 6
26
+ %28 = insertelement <8 x i32> %27, i32 %18, i64 7
27
+ %29 = zext <8 x i32> %28 to <8 x i64>
28
+ %30 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !11
29
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !12
30
+ %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13
31
+ %33 = bitcast i32 %32 to float, !dbg !13
32
+ %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14
33
+ %35 = bitcast i32 %34 to float, !dbg !14
34
+ %36 = mul nsw i64 %20, 50257, !dbg !15
35
+ %.not = icmp eq i64 %31, -1, !dbg !16
36
+ %37 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %33, float %35) #3, !dbg !17
37
+ %38 = select i1 %.not, float 0.000000e+00, float %37, !dbg !18
38
+ %invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !19
39
+ %39 = insertelement <8 x float> poison, float %38, i64 0, !dbg !20
40
+ %40 = shufflevector <8 x float> %39, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !20
41
+ br label %41, !dbg !19
42
+
43
+ 41: ; preds = %9, %41
44
+ %42 = phi i32 [ 0, %9 ], [ %85, %41 ]
45
+ %43 = phi <8 x float> [ zeroinitializer, %9 ], [ %84, %41 ]
46
+ %44 = zext nneg i32 %42 to i64, !dbg !21
47
+ %45 = insertelement <8 x i64> poison, i64 %44, i64 0, !dbg !21
48
+ %46 = shufflevector <8 x i64> %45, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !21
49
+ %47 = or <8 x i64> %46, %29, !dbg !21
50
+ %48 = icmp ult <8 x i64> %47, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
51
+ %49 = extractelement <8 x i64> %47, i64 0, !dbg !23
52
+ %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %49, !dbg !23
53
+ %50 = extractelement <8 x i64> %47, i64 1, !dbg !23
54
+ %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %50, !dbg !23
55
+ %51 = extractelement <8 x i64> %47, i64 2, !dbg !23
56
+ %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %51, !dbg !23
57
+ %52 = extractelement <8 x i64> %47, i64 3, !dbg !23
58
+ %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %52, !dbg !23
59
+ %53 = extractelement <8 x i64> %47, i64 4, !dbg !23
60
+ %gep9 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %53, !dbg !23
61
+ %54 = extractelement <8 x i64> %47, i64 5, !dbg !23
62
+ %gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %54, !dbg !23
63
+ %55 = extractelement <8 x i64> %47, i64 6, !dbg !23
64
+ %gep13 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %55, !dbg !23
65
+ %56 = extractelement <8 x i64> %47, i64 7, !dbg !23
66
+ %gep15 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %56, !dbg !23
67
+ %57 = extractelement <8 x i1> %48, i64 0, !dbg !24
68
+ %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %57, i32 0, i1 %57) #3, !dbg !24
69
+ %59 = extractelement <8 x i1> %48, i64 1, !dbg !24
70
+ %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %59, i32 0, i1 %59) #3, !dbg !24
71
+ %61 = extractelement <8 x i1> %48, i64 2, !dbg !24
72
+ %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %61, i32 0, i1 %61) #3, !dbg !24
73
+ %63 = extractelement <8 x i1> %48, i64 3, !dbg !24
74
+ %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %63, i32 0, i1 %63) #3, !dbg !24
75
+ %65 = extractelement <8 x i1> %48, i64 4, !dbg !24
76
+ %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep9, i1 %65, i32 0, i1 %65) #3, !dbg !24
77
+ %67 = extractelement <8 x i1> %48, i64 5, !dbg !24
78
+ %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep11, i1 %67, i32 0, i1 %67) #3, !dbg !24
79
+ %69 = extractelement <8 x i1> %48, i64 6, !dbg !24
80
+ %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep13, i1 %69, i32 0, i1 %69) #3, !dbg !24
81
+ %71 = extractelement <8 x i1> %48, i64 7, !dbg !24
82
+ %72 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep15, i1 %71, i32 0, i1 %71) #3, !dbg !24
83
+ %73 = insertelement <8 x i32> poison, i32 %58, i64 0, !dbg !24
84
+ %74 = insertelement <8 x i32> %73, i32 %60, i64 1, !dbg !24
85
+ %75 = insertelement <8 x i32> %74, i32 %62, i64 2, !dbg !24
86
+ %76 = insertelement <8 x i32> %75, i32 %64, i64 3, !dbg !24
87
+ %77 = insertelement <8 x i32> %76, i32 %66, i64 4, !dbg !24
88
+ %78 = insertelement <8 x i32> %77, i32 %68, i64 5, !dbg !24
89
+ %79 = insertelement <8 x i32> %78, i32 %70, i64 6, !dbg !24
90
+ %80 = insertelement <8 x i32> %79, i32 %72, i64 7, !dbg !24
91
+ %81 = bitcast <8 x i32> %80 to <8 x float>, !dbg !24
92
+ %82 = fmul <8 x float> %40, %81, !dbg !20
93
+ %83 = select <8 x i1> %48, <8 x float> %82, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25
94
+ %84 = fadd <8 x float> %43, %83, !dbg !25
95
+ %85 = add nuw nsw i32 %42, 2048, !dbg !19
96
+ %86 = icmp ult i32 %42, 48209, !dbg !19
97
+ br i1 %86, label %41, label %87, !dbg !19
98
+
99
+ 87: ; preds = %41
100
+ %88 = and i32 %10, 31, !dbg !8
101
+ %89 = and i32 %11, 7, !dbg !8
102
+ %shift = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
103
+ %90 = fadd <8 x float> %84, %shift, !dbg !26
104
+ %shift37 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
105
+ %91 = fadd <8 x float> %shift37, %90, !dbg !26
106
+ %shift38 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
107
+ %92 = fadd <8 x float> %shift38, %91, !dbg !26
108
+ %shift39 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
109
+ %93 = fadd <8 x float> %shift39, %92, !dbg !26
110
+ %shift40 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
111
+ %94 = fadd <8 x float> %shift40, %93, !dbg !26
112
+ %shift41 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
113
+ %95 = fadd <8 x float> %shift41, %94, !dbg !26
114
+ %shift42 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
115
+ %96 = fadd <8 x float> %shift42, %95, !dbg !26
116
+ %97 = extractelement <8 x float> %96, i64 0, !dbg !26
117
+ %98 = bitcast float %97 to i32, !dbg !32
118
+ %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !32
119
+ %100 = bitcast i32 %99 to float, !dbg !32
120
+ %101 = fadd float %97, %100, !dbg !26
121
+ %102 = bitcast float %101 to i32, !dbg !32
122
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !32
123
+ %104 = bitcast i32 %103 to float, !dbg !32
124
+ %105 = fadd float %101, %104, !dbg !26
125
+ %106 = bitcast float %105 to i32, !dbg !32
126
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !32
127
+ %108 = bitcast i32 %107 to float, !dbg !32
128
+ %109 = fadd float %105, %108, !dbg !26
129
+ %110 = bitcast float %109 to i32, !dbg !32
130
+ %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !32
131
+ %112 = bitcast i32 %111 to float, !dbg !32
132
+ %113 = fadd float %109, %112, !dbg !26
133
+ %114 = bitcast float %113 to i32, !dbg !32
134
+ %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !32
135
+ %116 = bitcast i32 %115 to float, !dbg !32
136
+ %117 = fadd float %113, %116, !dbg !26
137
+ %118 = icmp eq i32 %88, 0, !dbg !32
138
+ %119 = zext nneg i32 %89 to i64, !dbg !32
139
+ %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !32
140
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !32
141
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
142
+ %121 = icmp slt i32 %10, 8, !dbg !32
143
+ %122 = sext i32 %10 to i64, !dbg !32
144
+ %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !32
145
+ %124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !32
146
+ %125 = bitcast float %124 to i32, !dbg !32
147
+ %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 4, i32 31), !dbg !32
148
+ %127 = bitcast i32 %126 to float, !dbg !32
149
+ %128 = fadd float %124, %127, !dbg !26
150
+ %129 = bitcast float %128 to i32, !dbg !32
151
+ %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 2, i32 31), !dbg !32
152
+ %131 = bitcast i32 %130 to float, !dbg !32
153
+ %132 = fadd float %128, %131, !dbg !26
154
+ %133 = bitcast float %132 to i32, !dbg !32
155
+ %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !32
156
+ %135 = bitcast i32 %134 to float, !dbg !32
157
+ %136 = fadd float %132, %135, !dbg !26
158
+ %137 = and i32 %10, 7, !dbg !32
159
+ %138 = icmp eq i32 %137, 0, !dbg !32
160
+ %139 = and i1 %121, %138, !dbg !32
161
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %136, i1 %139) #3, !dbg !32
162
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
163
+ %140 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
164
+ %141 = extractelement <8 x i64> %29, i64 0, !dbg !34
165
+ %142 = extractelement <8 x i64> %29, i64 1, !dbg !34
166
+ %143 = extractelement <8 x i64> %29, i64 2, !dbg !34
167
+ %144 = extractelement <8 x i64> %29, i64 3, !dbg !34
168
+ %145 = extractelement <8 x i64> %29, i64 4, !dbg !34
169
+ %146 = extractelement <8 x i64> %29, i64 5, !dbg !34
170
+ %147 = extractelement <8 x i64> %29, i64 6, !dbg !34
171
+ %148 = extractelement <8 x i64> %29, i64 7, !dbg !34
172
+ br label %149, !dbg !35
173
+
174
+ 149: ; preds = %87, %149
175
+ %150 = phi i32 [ 0, %87 ], [ %312, %149 ]
176
+ %151 = zext nneg i32 %150 to i64, !dbg !34
177
+ %152 = or i64 %141, %151, !dbg !34
178
+ %153 = or i64 %142, %151, !dbg !34
179
+ %154 = or i64 %143, %151, !dbg !34
180
+ %155 = or i64 %144, %151, !dbg !34
181
+ %156 = or i64 %145, %151, !dbg !34
182
+ %157 = or i64 %146, %151, !dbg !34
183
+ %158 = or i64 %147, %151, !dbg !34
184
+ %159 = or i64 %148, %151, !dbg !34
185
+ %160 = icmp ult i64 %152, 50257, !dbg !36
186
+ %161 = icmp ult i64 %153, 50257, !dbg !36
187
+ %162 = icmp ult i64 %154, 50257, !dbg !36
188
+ %163 = icmp ult i64 %155, 50257, !dbg !36
189
+ %164 = icmp ult i64 %156, 50257, !dbg !36
190
+ %165 = icmp ult i64 %157, 50257, !dbg !36
191
+ %166 = icmp ult i64 %158, 50257, !dbg !36
192
+ %167 = icmp ult i64 %159, 50257, !dbg !36
193
+ %168 = add nsw i64 %152, %36, !dbg !37
194
+ %169 = add nsw i64 %153, %36, !dbg !37
195
+ %170 = add nsw i64 %154, %36, !dbg !37
196
+ %171 = add nsw i64 %155, %36, !dbg !37
197
+ %172 = add nsw i64 %156, %36, !dbg !37
198
+ %173 = add nsw i64 %157, %36, !dbg !37
199
+ %174 = add nsw i64 %158, %36, !dbg !37
200
+ %175 = add nsw i64 %159, %36, !dbg !37
201
+ %176 = getelementptr i16, ptr addrspace(1) %4, i64 %168, !dbg !38
202
+ %177 = getelementptr i16, ptr addrspace(1) %4, i64 %169, !dbg !38
203
+ %178 = getelementptr i16, ptr addrspace(1) %4, i64 %170, !dbg !38
204
+ %179 = getelementptr i16, ptr addrspace(1) %4, i64 %171, !dbg !38
205
+ %180 = getelementptr i16, ptr addrspace(1) %4, i64 %172, !dbg !38
206
+ %181 = getelementptr i16, ptr addrspace(1) %4, i64 %173, !dbg !38
207
+ %182 = getelementptr i16, ptr addrspace(1) %4, i64 %174, !dbg !38
208
+ %183 = getelementptr i16, ptr addrspace(1) %4, i64 %175, !dbg !38
209
+ %184 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %160, i16 0, i1 %160) #3, !dbg !39
210
+ %185 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %177, i1 %161, i16 0, i1 %161) #3, !dbg !39
211
+ %186 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %178, i1 %162, i16 0, i1 %162) #3, !dbg !39
212
+ %187 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %179, i1 %163, i16 0, i1 %163) #3, !dbg !39
213
+ %188 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %180, i1 %164, i16 0, i1 %164) #3, !dbg !39
214
+ %189 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %181, i1 %165, i16 0, i1 %165) #3, !dbg !39
215
+ %190 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %182, i1 %166, i16 0, i1 %166) #3, !dbg !39
216
+ %191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %183, i1 %167, i16 0, i1 %167) #3, !dbg !39
217
+ %192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %184) #3, !dbg !40
218
+ %193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %185) #3, !dbg !40
219
+ %194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %186) #3, !dbg !40
220
+ %195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %187) #3, !dbg !40
221
+ %196 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !40
222
+ %197 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !40
223
+ %198 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !40
224
+ %199 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !40
225
+ %200 = getelementptr float, ptr addrspace(1) %0, i64 %168, !dbg !41
226
+ %201 = getelementptr float, ptr addrspace(1) %0, i64 %169, !dbg !41
227
+ %202 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !41
228
+ %203 = getelementptr float, ptr addrspace(1) %0, i64 %171, !dbg !41
229
+ %204 = getelementptr float, ptr addrspace(1) %0, i64 %172, !dbg !41
230
+ %205 = getelementptr float, ptr addrspace(1) %0, i64 %173, !dbg !41
231
+ %206 = getelementptr float, ptr addrspace(1) %0, i64 %174, !dbg !41
232
+ %207 = getelementptr float, ptr addrspace(1) %0, i64 %175, !dbg !41
233
+ %208 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %200, i1 %160, i32 0, i1 %160) #3, !dbg !42
234
+ %209 = bitcast i32 %208 to float, !dbg !42
235
+ %210 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %201, i1 %161, i32 0, i1 %161) #3, !dbg !42
236
+ %211 = bitcast i32 %210 to float, !dbg !42
237
+ %212 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %202, i1 %162, i32 0, i1 %162) #3, !dbg !42
238
+ %213 = bitcast i32 %212 to float, !dbg !42
239
+ %214 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %203, i1 %163, i32 0, i1 %163) #3, !dbg !42
240
+ %215 = bitcast i32 %214 to float, !dbg !42
241
+ %216 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %204, i1 %164, i32 0, i1 %164) #3, !dbg !42
242
+ %217 = bitcast i32 %216 to float, !dbg !42
243
+ %218 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %205, i1 %165, i32 0, i1 %165) #3, !dbg !42
244
+ %219 = bitcast i32 %218 to float, !dbg !42
245
+ %220 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %206, i1 %166, i32 0, i1 %166) #3, !dbg !42
246
+ %221 = bitcast i32 %220 to float, !dbg !42
247
+ %222 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %207, i1 %167, i32 0, i1 %167) #3, !dbg !42
248
+ %223 = bitcast i32 %222 to float, !dbg !42
249
+ %224 = getelementptr i16, ptr addrspace(1) %5, i64 %168, !dbg !43
250
+ %225 = getelementptr i16, ptr addrspace(1) %5, i64 %169, !dbg !43
251
+ %226 = getelementptr i16, ptr addrspace(1) %5, i64 %170, !dbg !43
252
+ %227 = getelementptr i16, ptr addrspace(1) %5, i64 %171, !dbg !43
253
+ %228 = getelementptr i16, ptr addrspace(1) %5, i64 %172, !dbg !43
254
+ %229 = getelementptr i16, ptr addrspace(1) %5, i64 %173, !dbg !43
255
+ %230 = getelementptr i16, ptr addrspace(1) %5, i64 %174, !dbg !43
256
+ %231 = getelementptr i16, ptr addrspace(1) %5, i64 %175, !dbg !43
257
+ %232 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %224, i1 %160, i16 0, i1 %160) #3, !dbg !44
258
+ %233 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %225, i1 %161, i16 0, i1 %161) #3, !dbg !44
259
+ %234 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %226, i1 %162, i16 0, i1 %162) #3, !dbg !44
260
+ %235 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %227, i1 %163, i16 0, i1 %163) #3, !dbg !44
261
+ %236 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %228, i1 %164, i16 0, i1 %164) #3, !dbg !44
262
+ %237 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %229, i1 %165, i16 0, i1 %165) #3, !dbg !44
263
+ %238 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %230, i1 %166, i16 0, i1 %166) #3, !dbg !44
264
+ %239 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %231, i1 %167, i16 0, i1 %167) #3, !dbg !44
265
+ %240 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %232) #3, !dbg !45
266
+ %241 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %233) #3, !dbg !45
267
+ %242 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %234) #3, !dbg !45
268
+ %243 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %235) #3, !dbg !45
269
+ %244 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %236) #3, !dbg !45
270
+ %245 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %237) #3, !dbg !45
271
+ %246 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %238) #3, !dbg !45
272
+ %247 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %239) #3, !dbg !45
273
+ %248 = fmul float %38, %209, !dbg !46
274
+ %249 = fmul float %38, %211, !dbg !46
275
+ %250 = fmul float %38, %213, !dbg !46
276
+ %251 = fmul float %38, %215, !dbg !46
277
+ %252 = fmul float %38, %217, !dbg !46
278
+ %253 = fmul float %38, %219, !dbg !46
279
+ %254 = fmul float %38, %221, !dbg !46
280
+ %255 = fmul float %38, %223, !dbg !46
281
+ %256 = fmul float %240, 0x3FF7154760000000, !dbg !47
282
+ %257 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %256) #3, !dbg !47
283
+ %258 = fmul float %241, 0x3FF7154760000000, !dbg !47
284
+ %259 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %258) #3, !dbg !47
285
+ %260 = fmul float %242, 0x3FF7154760000000, !dbg !47
286
+ %261 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %260) #3, !dbg !47
287
+ %262 = fmul float %243, 0x3FF7154760000000, !dbg !47
288
+ %263 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %262) #3, !dbg !47
289
+ %264 = fmul float %244, 0x3FF7154760000000, !dbg !47
290
+ %265 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %264) #3, !dbg !47
291
+ %266 = fmul float %245, 0x3FF7154760000000, !dbg !47
292
+ %267 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %266) #3, !dbg !47
293
+ %268 = fmul float %246, 0x3FF7154760000000, !dbg !47
294
+ %269 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %268) #3, !dbg !47
295
+ %270 = fmul float %247, 0x3FF7154760000000, !dbg !47
296
+ %271 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %270) #3, !dbg !47
297
+ %272 = fmul float %140, %257, !dbg !48
298
+ %273 = fmul float %140, %259, !dbg !48
299
+ %274 = fmul float %140, %261, !dbg !48
300
+ %275 = fmul float %140, %263, !dbg !48
301
+ %276 = fmul float %140, %265, !dbg !48
302
+ %277 = fmul float %140, %267, !dbg !48
303
+ %278 = fmul float %140, %269, !dbg !48
304
+ %279 = fmul float %140, %271, !dbg !48
305
+ %280 = fsub float %248, %272, !dbg !49
306
+ %281 = fsub float %249, %273, !dbg !49
307
+ %282 = fsub float %250, %274, !dbg !49
308
+ %283 = fsub float %251, %275, !dbg !49
309
+ %284 = fsub float %252, %276, !dbg !49
310
+ %285 = fsub float %253, %277, !dbg !49
311
+ %286 = fsub float %254, %278, !dbg !49
312
+ %287 = fsub float %255, %279, !dbg !49
313
+ %288 = fadd float %192, %280, !dbg !50
314
+ %289 = fadd float %193, %281, !dbg !50
315
+ %290 = fadd float %194, %282, !dbg !50
316
+ %291 = fadd float %195, %283, !dbg !50
317
+ %292 = fadd float %196, %284, !dbg !50
318
+ %293 = fadd float %197, %285, !dbg !50
319
+ %294 = fadd float %198, %286, !dbg !50
320
+ %295 = fadd float %199, %287, !dbg !50
321
+ %296 = getelementptr i16, ptr addrspace(1) %6, i64 %168, !dbg !51
322
+ %297 = getelementptr i16, ptr addrspace(1) %6, i64 %169, !dbg !51
323
+ %298 = getelementptr i16, ptr addrspace(1) %6, i64 %170, !dbg !51
324
+ %299 = getelementptr i16, ptr addrspace(1) %6, i64 %171, !dbg !51
325
+ %300 = getelementptr i16, ptr addrspace(1) %6, i64 %172, !dbg !51
326
+ %301 = getelementptr i16, ptr addrspace(1) %6, i64 %173, !dbg !51
327
+ %302 = getelementptr i16, ptr addrspace(1) %6, i64 %174, !dbg !51
328
+ %303 = getelementptr i16, ptr addrspace(1) %6, i64 %175, !dbg !51
329
+ %304 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %288) #3, !dbg !52
330
+ %305 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %289) #3, !dbg !52
331
+ %306 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %290) #3, !dbg !52
332
+ %307 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %291) #3, !dbg !52
333
+ %308 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %292) #3, !dbg !52
334
+ %309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %293) #3, !dbg !52
335
+ %310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %294) #3, !dbg !52
336
+ %311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %295) #3, !dbg !52
337
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %304, ptr addrspace(1) %296, i1 %160) #3, !dbg !52
338
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %305, ptr addrspace(1) %297, i1 %161) #3, !dbg !52
339
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %306, ptr addrspace(1) %298, i1 %162) #3, !dbg !52
340
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %307, ptr addrspace(1) %299, i1 %163) #3, !dbg !52
341
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %308, ptr addrspace(1) %300, i1 %164) #3, !dbg !52
342
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %309, ptr addrspace(1) %301, i1 %165) #3, !dbg !52
343
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %310, ptr addrspace(1) %302, i1 %166) #3, !dbg !52
344
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %311, ptr addrspace(1) %303, i1 %167) #3, !dbg !52
345
+ %312 = add nuw nsw i32 %150, 2048, !dbg !35
346
+ %313 = icmp ult i32 %150, 48209, !dbg !35
347
+ br i1 %313, label %149, label %314, !dbg !35
348
+
349
+ 314: ; preds = %149
350
+ ret void, !dbg !53
351
+ }
352
+
353
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
354
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
355
+
356
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
357
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
358
+
359
+ ; Function Attrs: convergent nocallback nounwind
360
+ declare void @llvm.nvvm.barrier0() #2
361
+
362
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
363
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
364
+ attributes #2 = { convergent nocallback nounwind }
365
+ attributes #3 = { nounwind }
366
+
367
+ !llvm.module.flags = !{!0}
368
+ !llvm.dbg.cu = !{!1}
369
+ !nvvm.annotations = !{!3, !4, !4, !3}
370
+
371
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
372
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
373
+ !2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
374
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
375
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
376
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
377
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
378
+ !7 = !{}
379
+ !8 = !DILocation(line: 24, column: 33, scope: !5)
380
+ !9 = !DILocation(line: 21, column: 28, scope: !5)
381
+ !10 = !DILocation(line: 21, column: 34, scope: !5)
382
+ !11 = !DILocation(line: 26, column: 30, scope: !5)
383
+ !12 = !DILocation(line: 26, column: 35, scope: !5)
384
+ !13 = !DILocation(line: 27, column: 19, scope: !5)
385
+ !14 = !DILocation(line: 29, column: 19, scope: !5)
386
+ !15 = !DILocation(line: 36, column: 46, scope: !5)
387
+ !16 = !DILocation(line: 38, column: 23, scope: !5)
388
+ !17 = !DILocation(line: 39, column: 22, scope: !5)
389
+ !18 = !DILocation(line: 41, column: 37, scope: !5)
390
+ !19 = !DILocation(line: 32, column: 36, scope: !5)
391
+ !20 = !DILocation(line: 42, column: 23, scope: !5)
392
+ !21 = !DILocation(line: 33, column: 27, scope: !5)
393
+ !22 = !DILocation(line: 34, column: 25, scope: !5)
394
+ !23 = !DILocation(line: 36, column: 34, scope: !5)
395
+ !24 = !DILocation(line: 36, column: 52, scope: !5)
396
+ !25 = !DILocation(line: 45, column: 40, scope: !5)
397
+ !26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
398
+ !27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
399
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
400
+ !29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
401
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
402
+ !31 = !DILocation(line: 46, column: 27, scope: !27)
403
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
404
+ !33 = !DILocation(line: 46, column: 27, scope: !29)
405
+ !34 = !DILocation(line: 52, column: 27, scope: !5)
406
+ !35 = !DILocation(line: 51, column: 36, scope: !5)
407
+ !36 = !DILocation(line: 53, column: 25, scope: !5)
408
+ !37 = !DILocation(line: 55, column: 41, scope: !5)
409
+ !38 = !DILocation(line: 55, column: 35, scope: !5)
410
+ !39 = !DILocation(line: 55, column: 53, scope: !5)
411
+ !40 = !DILocation(line: 55, column: 105, scope: !5)
412
+ !41 = !DILocation(line: 56, column: 35, scope: !5)
413
+ !42 = !DILocation(line: 56, column: 53, scope: !5)
414
+ !43 = !DILocation(line: 57, column: 35, scope: !5)
415
+ !44 = !DILocation(line: 57, column: 53, scope: !5)
416
+ !45 = !DILocation(line: 57, column: 105, scope: !5)
417
+ !46 = !DILocation(line: 63, column: 24, scope: !5)
418
+ !47 = !DILocation(line: 65, column: 23, scope: !5)
419
+ !48 = !DILocation(line: 66, column: 24, scope: !5)
420
+ !49 = !DILocation(line: 67, column: 24, scope: !5)
421
+ !50 = !DILocation(line: 69, column: 24, scope: !5)
422
+ !51 = !DILocation(line: 70, column: 29, scope: !5)
423
+ !52 = !DILocation(line: 70, column: 54, scope: !5)
424
+ !53 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx ADDED
@@ -0,0 +1,921 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
22
+ )
23
+ .maxntid 256, 1, 1
24
+ {
25
+ .reg .pred %p<83>;
26
+ .reg .b16 %rs<65>;
27
+ .reg .b32 %r<104>;
28
+ .reg .f32 %f<164>;
29
+ .reg .b64 %rd<126>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_6];
35
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_5];
36
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_4];
37
+ ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7de8_param_0];
38
+ $L__tmp0:
39
+ .loc 1 24 33
40
+ mov.u32 %r1, %tid.x;
41
+ ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7de8_param_1];
42
+ shr.u32 %r2, %r1, 5;
43
+ ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7de8_param_2];
44
+ and.b32 %r9, %r1, 255;
45
+ ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7de8_param_3];
46
+ or.b32 %r10, %r9, 256;
47
+ or.b32 %r11, %r9, 512;
48
+ or.b32 %r12, %r9, 768;
49
+ or.b32 %r13, %r9, 1024;
50
+ or.b32 %r14, %r9, 1280;
51
+ or.b32 %r15, %r9, 1536;
52
+ or.b32 %r16, %r9, 1792;
53
+ .loc 1 21 28
54
+ mov.u32 %r3, %ctaid.x;
55
+ cvt.u64.u32 %rd1, %r9;
56
+ cvt.u64.u32 %rd8, %r16;
57
+ cvt.u64.u32 %rd7, %r15;
58
+ cvt.u64.u32 %rd6, %r14;
59
+ cvt.u64.u32 %rd5, %r13;
60
+ cvt.u64.u32 %rd4, %r12;
61
+ cvt.u64.u32 %rd3, %r11;
62
+ cvt.u64.u32 %rd2, %r10;
63
+ .loc 1 26 30
64
+ mul.wide.s32 %rd33, %r3, 8;
65
+ add.s64 %rd27, %rd32, %rd33;
66
+ mov.pred %p1, -1;
67
+ .loc 1 26 35
68
+ mov.u64 %rd26, 0x0;
69
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];
70
+ .loc 1 27 19
71
+ mov.u32 %r7, 0x0;
72
+ @%p1 ld.global.b32 { %r7 }, [ %rd28 + 0 ];
73
+ .loc 1 29 19
74
+ mov.u32 %r8, 0x0;
75
+ @%p1 ld.global.b32 { %r8 }, [ %rd29 + 0 ];
76
+ .loc 1 36 46
77
+ mul.wide.s32 %rd9, %r3, 50257;
78
+ .loc 1 38 23
79
+ setp.eq.s64 %p4, %rd26, -1;
80
+ .loc 1 39 22
81
+ div.full.f32 %r6, %r7, %r8;
82
+ mov.b32 %f35, %r6;
83
+ .loc 1 41 37
84
+ selp.f32 %f2, 0f00000000, %f35, %p4;
85
+ .loc 1 32 36
86
+ shl.b64 %rd34, %rd9, 2;
87
+ add.s64 %rd10, %rd31, %rd34;
88
+ mov.f32 %f156, 0f00000000;
89
+ mov.u64 %rd124, 0;
90
+ mov.f32 %f157, %f156;
91
+ mov.f32 %f158, %f156;
92
+ mov.f32 %f159, %f156;
93
+ mov.f32 %f160, %f156;
94
+ mov.f32 %f161, %f156;
95
+ mov.f32 %f162, %f156;
96
+ mov.f32 %f163, %f156;
97
+ $L__BB0_1:
98
+ .loc 1 33 27
99
+ or.b64 %rd43, %rd124, %rd1;
100
+ or.b64 %rd44, %rd124, %rd2;
101
+ or.b64 %rd45, %rd124, %rd3;
102
+ or.b64 %rd46, %rd124, %rd4;
103
+ or.b64 %rd47, %rd124, %rd5;
104
+ or.b64 %rd48, %rd124, %rd6;
105
+ or.b64 %rd49, %rd124, %rd7;
106
+ or.b64 %rd50, %rd124, %rd8;
107
+ .loc 1 34 25
108
+ setp.lt.u64 %p20, %rd50, 50257;
109
+ setp.lt.u64 %p18, %rd49, 50257;
110
+ setp.lt.u64 %p16, %rd48, 50257;
111
+ setp.lt.u64 %p14, %rd47, 50257;
112
+ setp.lt.u64 %p12, %rd46, 50257;
113
+ setp.lt.u64 %p10, %rd45, 50257;
114
+ setp.lt.u64 %p8, %rd44, 50257;
115
+ setp.lt.u64 %p6, %rd43, 50257;
116
+ .loc 1 36 34
117
+ shl.b64 %rd51, %rd43, 2;
118
+ add.s64 %rd35, %rd10, %rd51;
119
+ shl.b64 %rd52, %rd44, 2;
120
+ add.s64 %rd36, %rd10, %rd52;
121
+ shl.b64 %rd53, %rd45, 2;
122
+ add.s64 %rd37, %rd10, %rd53;
123
+ shl.b64 %rd54, %rd46, 2;
124
+ add.s64 %rd38, %rd10, %rd54;
125
+ shl.b64 %rd55, %rd47, 2;
126
+ add.s64 %rd39, %rd10, %rd55;
127
+ shl.b64 %rd56, %rd48, 2;
128
+ add.s64 %rd40, %rd10, %rd56;
129
+ shl.b64 %rd57, %rd49, 2;
130
+ add.s64 %rd41, %rd10, %rd57;
131
+ shl.b64 %rd58, %rd50, 2;
132
+ add.s64 %rd42, %rd10, %rd58;
133
+ mov.b32 %r71, 0;
134
+ .loc 1 36 52
135
+ mov.u32 %r17, 0x0;
136
+ @%p6 ld.global.L1::evict_last.b32 { %r17 }, [ %rd35 + 0 ];
137
+ @!%p6 mov.u32 %r17, %r71;
138
+ mov.u32 %r19, 0x0;
139
+ @%p8 ld.global.L1::evict_last.b32 { %r19 }, [ %rd36 + 0 ];
140
+ @!%p8 mov.u32 %r19, %r71;
141
+ mov.u32 %r21, 0x0;
142
+ @%p10 ld.global.L1::evict_last.b32 { %r21 }, [ %rd37 + 0 ];
143
+ @!%p10 mov.u32 %r21, %r71;
144
+ mov.u32 %r23, 0x0;
145
+ @%p12 ld.global.L1::evict_last.b32 { %r23 }, [ %rd38 + 0 ];
146
+ @!%p12 mov.u32 %r23, %r71;
147
+ mov.u32 %r25, 0x0;
148
+ @%p14 ld.global.L1::evict_last.b32 { %r25 }, [ %rd39 + 0 ];
149
+ @!%p14 mov.u32 %r25, %r71;
150
+ mov.u32 %r27, 0x0;
151
+ @%p16 ld.global.L1::evict_last.b32 { %r27 }, [ %rd40 + 0 ];
152
+ @!%p16 mov.u32 %r27, %r71;
153
+ mov.u32 %r29, 0x0;
154
+ @%p18 ld.global.L1::evict_last.b32 { %r29 }, [ %rd41 + 0 ];
155
+ @!%p18 mov.u32 %r29, %r71;
156
+ mov.u32 %r31, 0x0;
157
+ @%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd42 + 0 ];
158
+ @!%p20 mov.u32 %r31, %r71;
159
+ mov.b32 %f36, %r31;
160
+ mov.b32 %f37, %r29;
161
+ mov.b32 %f38, %r27;
162
+ mov.b32 %f39, %r25;
163
+ mov.b32 %f40, %r23;
164
+ mov.b32 %f41, %r21;
165
+ mov.b32 %f42, %r19;
166
+ mov.b32 %f43, %r17;
167
+ .loc 1 42 23
168
+ mul.f32 %f44, %f2, %f43;
169
+ mul.f32 %f45, %f2, %f42;
170
+ mul.f32 %f46, %f2, %f41;
171
+ mul.f32 %f47, %f2, %f40;
172
+ mul.f32 %f48, %f2, %f39;
173
+ mul.f32 %f49, %f2, %f38;
174
+ mul.f32 %f50, %f2, %f37;
175
+ mul.f32 %f51, %f2, %f36;
176
+ .loc 1 45 40
177
+ selp.f32 %f52, %f51, 0f80000000, %p20;
178
+ selp.f32 %f53, %f50, 0f80000000, %p18;
179
+ selp.f32 %f54, %f49, 0f80000000, %p16;
180
+ selp.f32 %f55, %f48, 0f80000000, %p14;
181
+ selp.f32 %f56, %f47, 0f80000000, %p12;
182
+ selp.f32 %f57, %f46, 0f80000000, %p10;
183
+ selp.f32 %f58, %f45, 0f80000000, %p8;
184
+ selp.f32 %f59, %f44, 0f80000000, %p6;
185
+ add.f32 %f156, %f156, %f59;
186
+ add.f32 %f157, %f157, %f58;
187
+ add.f32 %f158, %f158, %f57;
188
+ add.f32 %f159, %f159, %f56;
189
+ add.f32 %f160, %f160, %f55;
190
+ add.f32 %f161, %f161, %f54;
191
+ add.f32 %f162, %f162, %f53;
192
+ add.f32 %f163, %f163, %f52;
193
+ .loc 1 32 36
194
+ add.s64 %rd124, %rd124, 2048;
195
+ cvt.u32.u64 %r33, %rd124;
196
+ add.s32 %r34, %r33, -2048;
197
+ setp.lt.u32 %p21, %r34, 48209;
198
+ @%p21 bra $L__BB0_1;
199
+ .loc 1 24 33
200
+ and.b32 %r41, %r1, 31;
201
+ and.b32 %r42, %r2, 7;
202
+ $L__tmp1:
203
+ .loc 2 233 15
204
+ add.f32 %f60, %f156, %f157;
205
+ add.f32 %f61, %f158, %f60;
206
+ add.f32 %f62, %f159, %f61;
207
+ add.f32 %f63, %f160, %f62;
208
+ add.f32 %f64, %f161, %f63;
209
+ add.f32 %f65, %f162, %f64;
210
+ add.f32 %f66, %f163, %f65;
211
+ $L__tmp2:
212
+ .loc 2 243 36
213
+ mov.b32 %r43, %f66;
214
+ shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1;
215
+ mov.b32 %f67, %r44;
216
+ $L__tmp3:
217
+ .loc 2 233 15
218
+ add.f32 %f68, %f66, %f67;
219
+ $L__tmp4:
220
+ .loc 2 243 36
221
+ mov.b32 %r45, %f68;
222
+ shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1;
223
+ mov.b32 %f69, %r46;
224
+ $L__tmp5:
225
+ .loc 2 233 15
226
+ add.f32 %f70, %f68, %f69;
227
+ $L__tmp6:
228
+ .loc 2 243 36
229
+ mov.b32 %r47, %f70;
230
+ shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1;
231
+ mov.b32 %f71, %r48;
232
+ $L__tmp7:
233
+ .loc 2 233 15
234
+ add.f32 %f72, %f70, %f71;
235
+ $L__tmp8:
236
+ .loc 2 243 36
237
+ mov.b32 %r49, %f72;
238
+ shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1;
239
+ mov.b32 %f73, %r50;
240
+ $L__tmp9:
241
+ .loc 2 233 15
242
+ add.f32 %f74, %f72, %f73;
243
+ $L__tmp10:
244
+ .loc 2 243 36
245
+ mov.b32 %r51, %f74;
246
+ shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
247
+ mov.b32 %f75, %r52;
248
+ $L__tmp11:
249
+ .loc 2 233 15
250
+ add.f32 %f76, %f74, %f75;
251
+ $L__tmp12:
252
+ .loc 2 243 36
253
+ setp.eq.s32 %p22, %r41, 0;
254
+ shl.b32 %r53, %r42, 2;
255
+ mov.u32 %r54, global_smem;
256
+ add.s32 %r35, %r54, %r53;
257
+ mov.b32 %r36, %f76;
258
+ @%p22 st.shared.b32 [ %r35 + 0 ], %r36;
259
+ bar.sync 0;
260
+ setp.lt.s32 %p23, %r1, 8;
261
+ shl.b32 %r55, %r1, 2;
262
+ add.s32 %r38, %r54, %r55;
263
+ @%p23 ld.shared.b32 %r37, [ %r38 + 0 ];
264
+ mov.b32 %f77, %r37;
265
+ shfl.sync.bfly.b32 %r56, %r37, 4, 31, -1;
266
+ mov.b32 %f78, %r56;
267
+ $L__tmp13:
268
+ .loc 2 233 15
269
+ add.f32 %f79, %f77, %f78;
270
+ $L__tmp14:
271
+ .loc 2 243 36
272
+ mov.b32 %r57, %f79;
273
+ shfl.sync.bfly.b32 %r58, %r57, 2, 31, -1;
274
+ mov.b32 %f80, %r58;
275
+ $L__tmp15:
276
+ .loc 2 233 15
277
+ add.f32 %f81, %f79, %f80;
278
+ $L__tmp16:
279
+ .loc 2 243 36
280
+ mov.b32 %r59, %f81;
281
+ shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1;
282
+ mov.b32 %f82, %r60;
283
+ $L__tmp17:
284
+ .loc 2 233 15
285
+ add.f32 %f83, %f81, %f82;
286
+ $L__tmp18:
287
+ .loc 2 243 36
288
+ and.b32 %r61, %r1, 7;
289
+ setp.eq.s32 %p25, %r61, 0;
290
+ and.pred %p24, %p23, %p25;
291
+ mov.b32 %r40, %f83;
292
+ @%p24 st.shared.b32 [ %r38 + 0 ], %r40;
293
+ bar.sync 0;
294
+ ld.shared.f32 %f26, [global_smem];
295
+ mov.u64 %rd125, 0;
296
+ mov.u16 %rs2, 0;
297
+ $L__tmp19:
298
+ $L__BB0_3:
299
+ .loc 1 52 27
300
+ or.b64 %rd92, %rd1, %rd125;
301
+ or.b64 %rd93, %rd2, %rd125;
302
+ or.b64 %rd94, %rd3, %rd125;
303
+ or.b64 %rd95, %rd4, %rd125;
304
+ or.b64 %rd96, %rd5, %rd125;
305
+ or.b64 %rd97, %rd6, %rd125;
306
+ or.b64 %rd98, %rd7, %rd125;
307
+ or.b64 %rd99, %rd8, %rd125;
308
+ .loc 1 53 25
309
+ setp.lt.u64 %p26, %rd92, 50257;
310
+ setp.lt.u64 %p28, %rd93, 50257;
311
+ setp.lt.u64 %p30, %rd94, 50257;
312
+ setp.lt.u64 %p32, %rd95, 50257;
313
+ setp.lt.u64 %p34, %rd96, 50257;
314
+ setp.lt.u64 %p36, %rd97, 50257;
315
+ setp.lt.u64 %p38, %rd98, 50257;
316
+ setp.lt.u64 %p40, %rd99, 50257;
317
+ .loc 1 55 41
318
+ add.s64 %rd100, %rd92, %rd9;
319
+ add.s64 %rd101, %rd93, %rd9;
320
+ add.s64 %rd102, %rd94, %rd9;
321
+ add.s64 %rd103, %rd95, %rd9;
322
+ add.s64 %rd104, %rd96, %rd9;
323
+ add.s64 %rd105, %rd97, %rd9;
324
+ add.s64 %rd106, %rd98, %rd9;
325
+ add.s64 %rd107, %rd99, %rd9;
326
+ .loc 1 55 35
327
+ shl.b64 %rd108, %rd100, 1;
328
+ add.s64 %rd60, %rd23, %rd108;
329
+ shl.b64 %rd109, %rd101, 1;
330
+ add.s64 %rd61, %rd23, %rd109;
331
+ shl.b64 %rd110, %rd102, 1;
332
+ add.s64 %rd62, %rd23, %rd110;
333
+ shl.b64 %rd111, %rd103, 1;
334
+ add.s64 %rd63, %rd23, %rd111;
335
+ shl.b64 %rd112, %rd104, 1;
336
+ add.s64 %rd64, %rd23, %rd112;
337
+ shl.b64 %rd113, %rd105, 1;
338
+ add.s64 %rd65, %rd23, %rd113;
339
+ shl.b64 %rd114, %rd106, 1;
340
+ add.s64 %rd66, %rd23, %rd114;
341
+ shl.b64 %rd115, %rd107, 1;
342
+ add.s64 %rd67, %rd23, %rd115;
343
+ .loc 1 55 53
344
+ mov.u16 %rs1, 0x0;
345
+ @%p26 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd60 + 0 ];
346
+ @!%p26 mov.u16 %rs1, %rs2;
347
+ mov.u16 %rs3, 0x0;
348
+ @%p28 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd61 + 0 ];
349
+ @!%p28 mov.u16 %rs3, %rs2;
350
+ mov.u16 %rs5, 0x0;
351
+ @%p30 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd62 + 0 ];
352
+ @!%p30 mov.u16 %rs5, %rs2;
353
+ mov.u16 %rs7, 0x0;
354
+ @%p32 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd63 + 0 ];
355
+ @!%p32 mov.u16 %rs7, %rs2;
356
+ mov.u16 %rs9, 0x0;
357
+ @%p34 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd64 + 0 ];
358
+ @!%p34 mov.u16 %rs9, %rs2;
359
+ mov.u16 %rs11, 0x0;
360
+ @%p36 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd65 + 0 ];
361
+ @!%p36 mov.u16 %rs11, %rs2;
362
+ mov.u16 %rs13, 0x0;
363
+ @%p38 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd66 + 0 ];
364
+ @!%p38 mov.u16 %rs13, %rs2;
365
+ mov.u16 %rs15, 0x0;
366
+ @%p40 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd67 + 0 ];
367
+ @!%p40 mov.u16 %rs15, %rs2;
368
+ .loc 1 55 105
369
+ cvt.f32.bf16 %r62, %rs1;
370
+ mov.b32 %f100, %r62;
371
+ cvt.f32.bf16 %r63, %rs3;
372
+ mov.b32 %f101, %r63;
373
+ cvt.f32.bf16 %r64, %rs5;
374
+ mov.b32 %f102, %r64;
375
+ cvt.f32.bf16 %r65, %rs7;
376
+ mov.b32 %f103, %r65;
377
+ cvt.f32.bf16 %r66, %rs9;
378
+ mov.b32 %f104, %r66;
379
+ cvt.f32.bf16 %r67, %rs11;
380
+ mov.b32 %f105, %r67;
381
+ cvt.f32.bf16 %r68, %rs13;
382
+ mov.b32 %f106, %r68;
383
+ cvt.f32.bf16 %r69, %rs15;
384
+ mov.b32 %f107, %r69;
385
+ .loc 1 56 35
386
+ shl.b64 %rd116, %rd92, 2;
387
+ add.s64 %rd68, %rd10, %rd116;
388
+ shl.b64 %rd117, %rd93, 2;
389
+ add.s64 %rd69, %rd10, %rd117;
390
+ shl.b64 %rd118, %rd94, 2;
391
+ add.s64 %rd70, %rd10, %rd118;
392
+ shl.b64 %rd119, %rd95, 2;
393
+ add.s64 %rd71, %rd10, %rd119;
394
+ shl.b64 %rd120, %rd96, 2;
395
+ add.s64 %rd72, %rd10, %rd120;
396
+ shl.b64 %rd121, %rd97, 2;
397
+ add.s64 %rd73, %rd10, %rd121;
398
+ shl.b64 %rd122, %rd98, 2;
399
+ add.s64 %rd74, %rd10, %rd122;
400
+ shl.b64 %rd123, %rd99, 2;
401
+ add.s64 %rd75, %rd10, %rd123;
402
+ .loc 1 56 53
403
+ mov.u32 %r70, 0x0;
404
+ @%p26 ld.global.L1::evict_first.b32 { %r70 }, [ %rd68 + 0 ];
405
+ @!%p26 mov.u32 %r70, %r71;
406
+ mov.b32 %f108, %r70;
407
+ mov.u32 %r72, 0x0;
408
+ @%p28 ld.global.L1::evict_first.b32 { %r72 }, [ %rd69 + 0 ];
409
+ @!%p28 mov.u32 %r72, %r71;
410
+ mov.b32 %f109, %r72;
411
+ mov.u32 %r74, 0x0;
412
+ @%p30 ld.global.L1::evict_first.b32 { %r74 }, [ %rd70 + 0 ];
413
+ @!%p30 mov.u32 %r74, %r71;
414
+ mov.b32 %f110, %r74;
415
+ mov.u32 %r76, 0x0;
416
+ @%p32 ld.global.L1::evict_first.b32 { %r76 }, [ %rd71 + 0 ];
417
+ @!%p32 mov.u32 %r76, %r71;
418
+ mov.b32 %f111, %r76;
419
+ mov.u32 %r78, 0x0;
420
+ @%p34 ld.global.L1::evict_first.b32 { %r78 }, [ %rd72 + 0 ];
421
+ @!%p34 mov.u32 %r78, %r71;
422
+ mov.b32 %f112, %r78;
423
+ mov.u32 %r80, 0x0;
424
+ @%p36 ld.global.L1::evict_first.b32 { %r80 }, [ %rd73 + 0 ];
425
+ @!%p36 mov.u32 %r80, %r71;
426
+ mov.b32 %f113, %r80;
427
+ mov.u32 %r82, 0x0;
428
+ @%p38 ld.global.L1::evict_first.b32 { %r82 }, [ %rd74 + 0 ];
429
+ @!%p38 mov.u32 %r82, %r71;
430
+ mov.b32 %f114, %r82;
431
+ mov.u32 %r84, 0x0;
432
+ @%p40 ld.global.L1::evict_first.b32 { %r84 }, [ %rd75 + 0 ];
433
+ @!%p40 mov.u32 %r84, %r71;
434
+ mov.b32 %f115, %r84;
435
+ .loc 1 57 35
436
+ add.s64 %rd76, %rd24, %rd108;
437
+ add.s64 %rd77, %rd24, %rd109;
438
+ add.s64 %rd78, %rd24, %rd110;
439
+ add.s64 %rd79, %rd24, %rd111;
440
+ add.s64 %rd80, %rd24, %rd112;
441
+ add.s64 %rd81, %rd24, %rd113;
442
+ add.s64 %rd82, %rd24, %rd114;
443
+ add.s64 %rd83, %rd24, %rd115;
444
+ .loc 1 57 53
445
+ mov.u16 %rs25, 0x0;
446
+ @%p26 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd76 + 0 ];
447
+ @!%p26 mov.u16 %rs25, %rs2;
448
+ mov.u16 %rs27, 0x0;
449
+ @%p28 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd77 + 0 ];
450
+ @!%p28 mov.u16 %rs27, %rs2;
451
+ mov.u16 %rs29, 0x0;
452
+ @%p30 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd78 + 0 ];
453
+ @!%p30 mov.u16 %rs29, %rs2;
454
+ mov.u16 %rs31, 0x0;
455
+ @%p32 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd79 + 0 ];
456
+ @!%p32 mov.u16 %rs31, %rs2;
457
+ mov.u16 %rs33, 0x0;
458
+ @%p34 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd80 + 0 ];
459
+ @!%p34 mov.u16 %rs33, %rs2;
460
+ mov.u16 %rs35, 0x0;
461
+ @%p36 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd81 + 0 ];
462
+ @!%p36 mov.u16 %rs35, %rs2;
463
+ mov.u16 %rs37, 0x0;
464
+ @%p38 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd82 + 0 ];
465
+ @!%p38 mov.u16 %rs37, %rs2;
466
+ mov.u16 %rs39, 0x0;
467
+ @%p40 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd83 + 0 ];
468
+ @!%p40 mov.u16 %rs39, %rs2;
469
+ .loc 1 57 105
470
+ cvt.f32.bf16 %r86, %rs25;
471
+ mov.b32 %f116, %r86;
472
+ cvt.f32.bf16 %r87, %rs27;
473
+ mov.b32 %f117, %r87;
474
+ cvt.f32.bf16 %r88, %rs29;
475
+ mov.b32 %f118, %r88;
476
+ cvt.f32.bf16 %r89, %rs31;
477
+ mov.b32 %f119, %r89;
478
+ cvt.f32.bf16 %r90, %rs33;
479
+ mov.b32 %f120, %r90;
480
+ cvt.f32.bf16 %r91, %rs35;
481
+ mov.b32 %f121, %r91;
482
+ cvt.f32.bf16 %r92, %rs37;
483
+ mov.b32 %f122, %r92;
484
+ cvt.f32.bf16 %r93, %rs39;
485
+ mov.b32 %f123, %r93;
486
+ .loc 1 65 23
487
+ mul.f32 %f85, %f116, 0f3FB8AA3B;
488
+ ex2.approx.f32 %f84, %f85;
489
+ mul.f32 %f87, %f117, 0f3FB8AA3B;
490
+ ex2.approx.f32 %f86, %f87;
491
+ mul.f32 %f89, %f118, 0f3FB8AA3B;
492
+ ex2.approx.f32 %f88, %f89;
493
+ mul.f32 %f91, %f119, 0f3FB8AA3B;
494
+ ex2.approx.f32 %f90, %f91;
495
+ mul.f32 %f93, %f120, 0f3FB8AA3B;
496
+ ex2.approx.f32 %f92, %f93;
497
+ mul.f32 %f95, %f121, 0f3FB8AA3B;
498
+ ex2.approx.f32 %f94, %f95;
499
+ mul.f32 %f97, %f122, 0f3FB8AA3B;
500
+ ex2.approx.f32 %f96, %f97;
501
+ mul.f32 %f99, %f123, 0f3FB8AA3B;
502
+ ex2.approx.f32 %f98, %f99;
503
+ .loc 1 66 24
504
+ mul.f32 %f124, %f26, %f84;
505
+ mul.f32 %f125, %f26, %f86;
506
+ mul.f32 %f126, %f26, %f88;
507
+ mul.f32 %f127, %f26, %f90;
508
+ mul.f32 %f128, %f26, %f92;
509
+ mul.f32 %f129, %f26, %f94;
510
+ mul.f32 %f130, %f26, %f96;
511
+ mul.f32 %f131, %f26, %f98;
512
+ .loc 1 67 24
513
+ neg.f32 %f132, %f124;
514
+ fma.rn.f32 %f133, %f2, %f108, %f132;
515
+ neg.f32 %f134, %f125;
516
+ fma.rn.f32 %f135, %f2, %f109, %f134;
517
+ neg.f32 %f136, %f126;
518
+ fma.rn.f32 %f137, %f2, %f110, %f136;
519
+ neg.f32 %f138, %f127;
520
+ fma.rn.f32 %f139, %f2, %f111, %f138;
521
+ neg.f32 %f140, %f128;
522
+ fma.rn.f32 %f141, %f2, %f112, %f140;
523
+ neg.f32 %f142, %f129;
524
+ fma.rn.f32 %f143, %f2, %f113, %f142;
525
+ neg.f32 %f144, %f130;
526
+ fma.rn.f32 %f145, %f2, %f114, %f144;
527
+ neg.f32 %f146, %f131;
528
+ fma.rn.f32 %f147, %f2, %f115, %f146;
529
+ .loc 1 69 24
530
+ add.f32 %f148, %f100, %f133;
531
+ add.f32 %f149, %f101, %f135;
532
+ add.f32 %f150, %f102, %f137;
533
+ add.f32 %f151, %f103, %f139;
534
+ add.f32 %f152, %f104, %f141;
535
+ add.f32 %f153, %f105, %f143;
536
+ add.f32 %f154, %f106, %f145;
537
+ add.f32 %f155, %f107, %f147;
538
+ .loc 1 70 29
539
+ add.s64 %rd84, %rd25, %rd108;
540
+ add.s64 %rd85, %rd25, %rd109;
541
+ add.s64 %rd86, %rd25, %rd110;
542
+ add.s64 %rd87, %rd25, %rd111;
543
+ add.s64 %rd88, %rd25, %rd112;
544
+ add.s64 %rd89, %rd25, %rd113;
545
+ add.s64 %rd90, %rd25, %rd114;
546
+ add.s64 %rd91, %rd25, %rd115;
547
+ .loc 1 70 54
548
+ mov.b32 %r94, %f148;
549
+ cvt.rn.bf16.f32 %rs49, %r94;
550
+ mov.b32 %r95, %f149;
551
+ cvt.rn.bf16.f32 %rs50, %r95;
552
+ mov.b32 %r96, %f150;
553
+ cvt.rn.bf16.f32 %rs51, %r96;
554
+ mov.b32 %r97, %f151;
555
+ cvt.rn.bf16.f32 %rs52, %r97;
556
+ mov.b32 %r98, %f152;
557
+ cvt.rn.bf16.f32 %rs53, %r98;
558
+ mov.b32 %r99, %f153;
559
+ cvt.rn.bf16.f32 %rs54, %r99;
560
+ mov.b32 %r100, %f154;
561
+ cvt.rn.bf16.f32 %rs55, %r100;
562
+ mov.b32 %r101, %f155;
563
+ cvt.rn.bf16.f32 %rs56, %r101;
564
+ @%p26 st.global.b16 [ %rd84 + 0 ], { %rs49 };
565
+ @%p28 st.global.b16 [ %rd85 + 0 ], { %rs50 };
566
+ @%p30 st.global.b16 [ %rd86 + 0 ], { %rs51 };
567
+ @%p32 st.global.b16 [ %rd87 + 0 ], { %rs52 };
568
+ @%p34 st.global.b16 [ %rd88 + 0 ], { %rs53 };
569
+ @%p36 st.global.b16 [ %rd89 + 0 ], { %rs54 };
570
+ @%p38 st.global.b16 [ %rd90 + 0 ], { %rs55 };
571
+ @%p40 st.global.b16 [ %rd91 + 0 ], { %rs56 };
572
+ .loc 1 51 36
573
+ add.s64 %rd125, %rd125, 2048;
574
+ cvt.u32.u64 %r102, %rd125;
575
+ add.s32 %r103, %r102, -2048;
576
+ setp.lt.u32 %p82, %r103, 48209;
577
+ @%p82 bra $L__BB0_3;
578
+ .loc 1 51 4
579
+ ret;
580
+ $L__tmp20:
581
+ $L__func_end0:
582
+
583
+ }
584
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
585
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
586
+ .section .debug_abbrev
587
+ {
588
+ .b8 1
589
+ .b8 17
590
+ .b8 1
591
+ .b8 37
592
+ .b8 8
593
+ .b8 19
594
+ .b8 5
595
+ .b8 3
596
+ .b8 8
597
+ .b8 16
598
+ .b8 6
599
+ .b8 27
600
+ .b8 8
601
+ .b8 180
602
+ .b8 66
603
+ .b8 12
604
+ .b8 17
605
+ .b8 1
606
+ .b8 18
607
+ .b8 1
608
+ .b8 0
609
+ .b8 0
610
+ .b8 2
611
+ .b8 46
612
+ .b8 0
613
+ .b8 135
614
+ .b8 64
615
+ .b8 8
616
+ .b8 3
617
+ .b8 8
618
+ .b8 58
619
+ .b8 11
620
+ .b8 59
621
+ .b8 11
622
+ .b8 63
623
+ .b8 12
624
+ .b8 32
625
+ .b8 11
626
+ .b8 0
627
+ .b8 0
628
+ .b8 3
629
+ .b8 46
630
+ .b8 1
631
+ .b8 17
632
+ .b8 1
633
+ .b8 18
634
+ .b8 1
635
+ .b8 64
636
+ .b8 10
637
+ .b8 49
638
+ .b8 19
639
+ .b8 0
640
+ .b8 0
641
+ .b8 4
642
+ .b8 29
643
+ .b8 1
644
+ .b8 49
645
+ .b8 19
646
+ .b8 17
647
+ .b8 1
648
+ .b8 18
649
+ .b8 1
650
+ .b8 88
651
+ .b8 11
652
+ .b8 89
653
+ .b8 11
654
+ .b8 87
655
+ .b8 11
656
+ .b8 0
657
+ .b8 0
658
+ .b8 5
659
+ .b8 29
660
+ .b8 0
661
+ .b8 49
662
+ .b8 19
663
+ .b8 17
664
+ .b8 1
665
+ .b8 18
666
+ .b8 1
667
+ .b8 88
668
+ .b8 11
669
+ .b8 89
670
+ .b8 11
671
+ .b8 87
672
+ .b8 11
673
+ .b8 0
674
+ .b8 0
675
+ .b8 0
676
+ }
677
+ .section .debug_info
678
+ {
679
+ .b32 278
680
+ .b8 2
681
+ .b8 0
682
+ .b32 .debug_abbrev
683
+ .b8 8
684
+ .b8 1
685
+ .b8 116
686
+ .b8 114
687
+ .b8 105
688
+ .b8 116
689
+ .b8 111
690
+ .b8 110
691
+ .b8 0
692
+ .b8 2
693
+ .b8 0
694
+ .b8 99
695
+ .b8 107
696
+ .b8 122
697
+ .b8 103
698
+ .b8 108
699
+ .b8 55
700
+ .b8 116
701
+ .b8 104
702
+ .b8 98
703
+ .b8 52
704
+ .b8 120
705
+ .b8 100
706
+ .b8 102
707
+ .b8 107
708
+ .b8 102
709
+ .b8 110
710
+ .b8 100
711
+ .b8 50
712
+ .b8 116
713
+ .b8 105
714
+ .b8 100
715
+ .b8 107
716
+ .b8 115
717
+ .b8 54
718
+ .b8 109
719
+ .b8 116
720
+ .b8 53
721
+ .b8 102
722
+ .b8 51
723
+ .b8 104
724
+ .b8 97
725
+ .b8 117
726
+ .b8 119
727
+ .b8 102
728
+ .b8 121
729
+ .b8 106
730
+ .b8 102
731
+ .b8 108
732
+ .b8 98
733
+ .b8 116
734
+ .b8 122
735
+ .b8 121
736
+ .b8 101
737
+ .b8 112
738
+ .b8 111
739
+ .b8 53
740
+ .b8 111
741
+ .b8 120
742
+ .b8 107
743
+ .b8 118
744
+ .b8 104
745
+ .b8 107
746
+ .b8 46
747
+ .b8 112
748
+ .b8 121
749
+ .b8 0
750
+ .b32 .debug_line
751
+ .b8 47
752
+ .b8 116
753
+ .b8 109
754
+ .b8 112
755
+ .b8 47
756
+ .b8 116
757
+ .b8 111
758
+ .b8 114
759
+ .b8 99
760
+ .b8 104
761
+ .b8 105
762
+ .b8 110
763
+ .b8 100
764
+ .b8 117
765
+ .b8 99
766
+ .b8 116
767
+ .b8 111
768
+ .b8 114
769
+ .b8 95
770
+ .b8 114
771
+ .b8 111
772
+ .b8 111
773
+ .b8 116
774
+ .b8 47
775
+ .b8 107
776
+ .b8 122
777
+ .b8 0
778
+ .b8 1
779
+ .b64 $L__func_begin0
780
+ .b64 $L__func_end0
781
+ .b8 2
782
+ .b8 116
783
+ .b8 114
784
+ .b8 105
785
+ .b8 116
786
+ .b8 111
787
+ .b8 110
788
+ .b8 95
789
+ .b8 95
790
+ .b8 48
791
+ .b8 100
792
+ .b8 49
793
+ .b8 100
794
+ .b8 50
795
+ .b8 100
796
+ .b8 51
797
+ .b8 100
798
+ .b8 52
799
+ .b8 100
800
+ .b8 53
801
+ .b8 100
802
+ .b8 54
803
+ .b8 100
804
+ .b8 55
805
+ .b8 100
806
+ .b8 101
807
+ .b8 56
808
+ .b8 0
809
+ .b8 116
810
+ .b8 114
811
+ .b8 105
812
+ .b8 116
813
+ .b8 111
814
+ .b8 110
815
+ .b8 95
816
+ .b8 95
817
+ .b8 48
818
+ .b8 100
819
+ .b8 49
820
+ .b8 100
821
+ .b8 50
822
+ .b8 100
823
+ .b8 51
824
+ .b8 100
825
+ .b8 52
826
+ .b8 100
827
+ .b8 53
828
+ .b8 100
829
+ .b8 54
830
+ .b8 100
831
+ .b8 55
832
+ .b8 100
833
+ .b8 101
834
+ .b8 56
835
+ .b8 0
836
+ .b8 1
837
+ .b8 18
838
+ .b8 1
839
+ .b8 1
840
+ .b8 3
841
+ .b64 $L__func_begin0
842
+ .b64 $L__func_end0
843
+ .b8 1
844
+ .b8 156
845
+ .b32 125
846
+ .b8 4
847
+ .b32 125
848
+ .b64 $L__tmp1
849
+ .b64 $L__tmp18
850
+ .b8 2
851
+ .b8 46
852
+ .b8 27
853
+ .b8 5
854
+ .b32 125
855
+ .b64 $L__tmp1
856
+ .b64 $L__tmp18
857
+ .b8 2
858
+ .b8 243
859
+ .b8 36
860
+ .b8 0
861
+ .b8 5
862
+ .b32 125
863
+ .b64 $L__tmp2
864
+ .b64 $L__tmp19
865
+ .b8 2
866
+ .b8 46
867
+ .b8 27
868
+ .b8 0
869
+ .b8 0
870
+ }
871
+ .section .debug_pubnames
872
+ {
873
+ .b32 $L__pubNames_end0-$L__pubNames_start0
874
+ $L__pubNames_start0:
875
+ .b8 2
876
+ .b8 0
877
+ .b32 .debug_info
878
+ .b32 282
879
+ .b32 125
880
+ .b8 116
881
+ .b8 114
882
+ .b8 105
883
+ .b8 116
884
+ .b8 111
885
+ .b8 110
886
+ .b8 95
887
+ .b8 95
888
+ .b8 48
889
+ .b8 100
890
+ .b8 49
891
+ .b8 100
892
+ .b8 50
893
+ .b8 100
894
+ .b8 51
895
+ .b8 100
896
+ .b8 52
897
+ .b8 100
898
+ .b8 53
899
+ .b8 100
900
+ .b8 54
901
+ .b8 100
902
+ .b8 55
903
+ .b8 100
904
+ .b8 101
905
+ .b8 56
906
+ .b8 0
907
+ .b32 0
908
+ $L__pubNames_end0:
909
+ }
910
+ .section .debug_pubtypes
911
+ {
912
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
913
+ $L__pubTypes_start0:
914
+ .b8 2
915
+ .b8 0
916
+ .b32 .debug_info
917
+ .b32 282
918
+ .b32 0
919
+ $L__pubTypes_end0:
920
+ }
921
+ .section .debug_loc { }
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked>
5
+ %cst_0 = arith.constant dense<-1> : tensor<1x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
7
+ %cst_2 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %c2048_i32 = arith.constant 2048 : i32
10
+ %c50257_i32 = arith.constant 50257 : i32
11
+ %c50257_i64 = arith.constant 50257 : i64
12
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.extsi %0 : i32 to i64
15
+ %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
16
+ %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
17
+ %4 = arith.extsi %3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
18
+ %5 = tt.addptr %arg1, %1 : !tt.ptr<i64, 1>, i64
19
+ %6 = tt.splat %5 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
20
+ %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked>
21
+ %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
22
+ %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
23
+ %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
24
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
25
+ %12 = arith.muli %1, %c50257_i64 : i64
26
+ %13 = tt.splat %12 : (i64) -> tensor<1x2048xi64, #blocked>
27
+ %14 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
28
+ %15 = arith.cmpi ne, %7, %cst_0 : tensor<1x1xi64, #blocked>
29
+ %16 = arith.divf %9, %11 : f32
30
+ %17 = tt.splat %16 : (f32) -> tensor<1x1xf32, #blocked>
31
+ %18 = arith.select %15, %17, %cst : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked>
32
+ %19 = tt.broadcast %18 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
33
+ %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg10 = %cst_1) -> (tensor<1x2048xf32, #blocked>) : i32 {
34
+ %27 = arith.extsi %arg9 : i32 to i64
35
+ %28 = tt.splat %27 : (i64) -> tensor<1x2048xi64, #blocked>
36
+ %29 = arith.addi %28, %4 : tensor<1x2048xi64, #blocked>
37
+ %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x2048xi64, #blocked>
38
+ %31 = arith.addi %29, %13 : tensor<1x2048xi64, #blocked>
39
+ %32 = tt.addptr %14, %31 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
40
+ %33 = tt.load %32, %30, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
41
+ %34 = arith.mulf %33, %19 : tensor<1x2048xf32, #blocked>
42
+ %35 = arith.addf %arg10, %34 : tensor<1x2048xf32, #blocked>
43
+ %36 = arith.select %30, %35, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
44
+ scf.yield %36 : tensor<1x2048xf32, #blocked>
45
+ }
46
+ %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({
47
+ ^bb0(%arg9: f32, %arg10: f32):
48
+ %27 = arith.addf %arg9, %arg10 : f32
49
+ tt.reduce.return %27 : f32
50
+ }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
51
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
52
+ %23 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
53
+ %24 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
54
+ %25 = tt.broadcast %22 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
55
+ %26 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
56
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 : i32 {
57
+ %27 = arith.extsi %arg9 : i32 to i64
58
+ %28 = tt.splat %27 : (i64) -> tensor<1x2048xi64, #blocked>
59
+ %29 = arith.addi %28, %4 : tensor<1x2048xi64, #blocked>
60
+ %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x2048xi64, #blocked>
61
+ %31 = arith.addi %29, %13 : tensor<1x2048xi64, #blocked>
62
+ %32 = tt.addptr %23, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
63
+ %33 = tt.load %32, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
64
+ %34 = arith.extf %33 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
65
+ %35 = tt.addptr %14, %31 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
66
+ %36 = tt.load %35, %30, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
67
+ %37 = tt.addptr %24, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
68
+ %38 = tt.load %37, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
69
+ %39 = arith.extf %38 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
70
+ %40 = arith.mulf %36, %19 : tensor<1x2048xf32, #blocked>
71
+ %41 = math.exp %39 : tensor<1x2048xf32, #blocked>
72
+ %42 = arith.mulf %41, %25 : tensor<1x2048xf32, #blocked>
73
+ %43 = arith.subf %40, %42 : tensor<1x2048xf32, #blocked>
74
+ %44 = arith.addf %34, %43 : tensor<1x2048xf32, #blocked>
75
+ %45 = tt.addptr %26, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
76
+ %46 = arith.truncf %44 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked>
77
+ tt.store %45, %46, %30 {cache = 1 : i32, evict = 1 : i32} : tensor<1x2048xbf16, #blocked>
78
+ }
79
+ tt.return
80
+ }
81
+ }
.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32>
5
+ %c50257_i64 = arith.constant 50257 : i64
6
+ %c50257_i32 = arith.constant 50257 : i32
7
+ %c2048_i32 = arith.constant 2048 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64>
10
+ %cst_2 = arith.constant dense<-1> : tensor<1x1xi64>
11
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.extsi %0 : i32 to i64
14
+ %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>
15
+ %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
16
+ %4 = arith.extsi %3 : tensor<1x2048xi32> to tensor<1x2048xi64>
17
+ %5 = tt.addptr %arg1, %1 : !tt.ptr<i64, 1>, i64
18
+ %6 = tt.splat %5 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
19
+ %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64>
20
+ %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
21
+ %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
22
+ %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
23
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
24
+ %12 = arith.muli %1, %c50257_i64 : i64
25
+ %13 = tt.splat %12 : (i64) -> tensor<1x2048xi64>
26
+ %14 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
27
+ %15 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64>
28
+ %16 = arith.divf %9, %11 : f32
29
+ %17 = tt.splat %16 : (f32) -> tensor<1x1xf32>
30
+ %18 = arith.select %15, %17, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32>
31
+ %19 = tt.broadcast %18 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
32
+ %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg10 = %cst_3) -> (tensor<1x2048xf32>) : i32 {
33
+ %35 = arith.extsi %arg9 : i32 to i64
34
+ %36 = tt.splat %35 : (i64) -> tensor<1x2048xi64>
35
+ %37 = arith.addi %36, %4 : tensor<1x2048xi64>
36
+ %38 = arith.cmpi slt, %37, %cst_1 : tensor<1x2048xi64>
37
+ %39 = arith.addi %37, %13 : tensor<1x2048xi64>
38
+ %40 = tt.addptr %14, %39 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
39
+ %41 = tt.load %40, %38, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xf32>
40
+ %42 = arith.mulf %41, %19 : tensor<1x2048xf32>
41
+ %43 = arith.addf %arg10, %42 : tensor<1x2048xf32>
42
+ %44 = arith.select %38, %43, %arg10 : tensor<1x2048xi1>, tensor<1x2048xf32>
43
+ scf.yield %44 : tensor<1x2048xf32>
44
+ }
45
+ %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({
46
+ ^bb0(%arg9: f32, %arg10: f32):
47
+ %35 = arith.addf %arg9, %arg10 : f32
48
+ tt.reduce.return %35 : f32
49
+ }) : (tensor<1x2048xf32>) -> tensor<1xf32>
50
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
51
+ %23 = arith.muli %1, %c50257_i64 : i64
52
+ %24 = tt.splat %23 : (i64) -> tensor<1x2048xi64>
53
+ %25 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
54
+ %26 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
55
+ %27 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
56
+ %28 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64>
57
+ %29 = arith.divf %9, %11 : f32
58
+ %30 = tt.splat %29 : (f32) -> tensor<1x1xf32>
59
+ %31 = arith.select %28, %30, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32>
60
+ %32 = tt.broadcast %31 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
61
+ %33 = tt.broadcast %22 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
62
+ %34 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
63
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 : i32 {
64
+ %35 = arith.extsi %arg9 : i32 to i64
65
+ %36 = tt.splat %35 : (i64) -> tensor<1x2048xi64>
66
+ %37 = arith.addi %36, %4 : tensor<1x2048xi64>
67
+ %38 = arith.cmpi slt, %37, %cst_1 : tensor<1x2048xi64>
68
+ %39 = arith.addi %37, %24 : tensor<1x2048xi64>
69
+ %40 = tt.addptr %25, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
70
+ %41 = tt.load %40, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
71
+ %42 = arith.extf %41 : tensor<1x2048xbf16> to tensor<1x2048xf32>
72
+ %43 = tt.addptr %26, %39 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
73
+ %44 = tt.load %43, %38, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32>
74
+ %45 = tt.addptr %27, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
75
+ %46 = tt.load %45, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
76
+ %47 = arith.extf %46 : tensor<1x2048xbf16> to tensor<1x2048xf32>
77
+ %48 = arith.mulf %44, %32 : tensor<1x2048xf32>
78
+ %49 = math.exp %47 : tensor<1x2048xf32>
79
+ %50 = arith.mulf %49, %33 : tensor<1x2048xf32>
80
+ %51 = arith.subf %48, %50 : tensor<1x2048xf32>
81
+ %52 = arith.addf %42, %51 : tensor<1x2048xf32>
82
+ %53 = tt.addptr %34, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
83
+ %54 = arith.truncf %52 : tensor<1x2048xf32> to tensor<1x2048xbf16>
84
+ tt.store %53, %54, %38 {cache = 1 : i32, evict = 1 : i32} : tensor<1x2048xbf16>
85
+ }
86
+ tt.return
87
+ }
88
+ }
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin ADDED
Binary file (5.16 kB). View file
 
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 1, !dbg !8
7
+ %6 = and i32 %5, 510, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 9, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = icmp slt i32 %9, 12865792, !dbg !12
12
+ %11 = sext i32 %9 to i64, !dbg !13
13
+ %12 = getelementptr i16, ptr addrspace(1) %0, i64 %11, !dbg !13
14
+ %13 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %12, i1 %10) #1, !dbg !14
15
+ %14 = trunc i32 %13 to i16, !dbg !14
16
+ %extelt.offset = lshr i32 %13, 16, !dbg !14
17
+ %15 = trunc i32 %extelt.offset to i16, !dbg !14
18
+ %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !15
19
+ %17 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %15) #1, !dbg !15
20
+ %18 = getelementptr float, ptr addrspace(1) %1, i64 %11, !dbg !16
21
+ %19 = bitcast float %16 to i32, !dbg !17
22
+ %20 = bitcast float %17 to i32, !dbg !17
23
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %19, i32 %20, ptr addrspace(1) %18, i1 %10) #1, !dbg !17
24
+ ret void, !dbg !18
25
+ }
26
+
27
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
28
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
29
+
30
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
31
+ attributes #1 = { nounwind }
32
+
33
+ !llvm.module.flags = !{!0}
34
+ !llvm.dbg.cu = !{!1}
35
+ !nvvm.annotations = !{!3, !4, !4, !3}
36
+
37
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
38
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
39
+ !2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
40
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
41
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
42
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
43
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
44
+ !7 = !{}
45
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
46
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
47
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
48
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
49
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
50
+ !13 = !DILocation(line: 24, column: 30, scope: !5)
51
+ !14 = !DILocation(line: 24, column: 35, scope: !5)
52
+ !15 = !DILocation(line: 24, column: 45, scope: !5)
53
+ !16 = !DILocation(line: 26, column: 25, scope: !5)
54
+ !17 = !DILocation(line: 26, column: 36, scope: !5)
55
+ !18 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 256, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<3>;
20
+ .reg .b32 %r<12>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r7, %tid.x;
31
+ shl.b32 %r8, %r7, 1;
32
+ and.b32 %r9, %r8, 510;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r10, %r1, 9;
37
+ .loc 1 21 23
38
+ or.b32 %r11, %r10, %r9;
39
+ .loc 1 22 21
40
+ setp.lt.s32 %p1, %r11, 12865792;
41
+ .loc 1 24 30
42
+ mul.wide.s32 %rd5, %r11, 2;
43
+ add.s64 %rd1, %rd3, %rd5;
44
+ .loc 1 24 35
45
+ mov.u32 %r2, 0x0;
46
+ @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
47
+ cvt.u16.u32 %rs1, %r2;
48
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
49
+ .loc 1 24 45
50
+ cvt.f32.bf16 %r5, %rs1;
51
+ cvt.f32.bf16 %r6, %rs2;
52
+ .loc 1 26 25
53
+ mul.wide.s32 %rd6, %r11, 4;
54
+ add.s64 %rd2, %rd4, %rd6;
55
+ .loc 1 26 36
56
+ @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 };
57
+ .loc 1 26 4
58
+ ret;
59
+ $L__tmp1:
60
+ $L__func_end0:
61
+
62
+ }
63
+ .file 1 "/tmp/torchinductor_root/mx/cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py"
64
+ .section .debug_abbrev
65
+ {
66
+ .b8 1
67
+ .b8 17
68
+ .b8 1
69
+ .b8 37
70
+ .b8 8
71
+ .b8 19
72
+ .b8 5
73
+ .b8 3
74
+ .b8 8
75
+ .b8 16
76
+ .b8 6
77
+ .b8 27
78
+ .b8 8
79
+ .b8 180
80
+ .b8 66
81
+ .b8 12
82
+ .b8 17
83
+ .b8 1
84
+ .b8 18
85
+ .b8 1
86
+ .b8 0
87
+ .b8 0
88
+ .b8 2
89
+ .b8 46
90
+ .b8 0
91
+ .b8 17
92
+ .b8 1
93
+ .b8 18
94
+ .b8 1
95
+ .b8 64
96
+ .b8 10
97
+ .b8 135
98
+ .b8 64
99
+ .b8 8
100
+ .b8 3
101
+ .b8 8
102
+ .b8 58
103
+ .b8 11
104
+ .b8 59
105
+ .b8 11
106
+ .b8 63
107
+ .b8 12
108
+ .b8 0
109
+ .b8 0
110
+ .b8 0
111
+ }
112
+ .section .debug_info
113
+ {
114
+ .b32 176
115
+ .b8 2
116
+ .b8 0
117
+ .b32 .debug_abbrev
118
+ .b8 8
119
+ .b8 1
120
+ .b8 116
121
+ .b8 114
122
+ .b8 105
123
+ .b8 116
124
+ .b8 111
125
+ .b8 110
126
+ .b8 0
127
+ .b8 2
128
+ .b8 0
129
+ .b8 99
130
+ .b8 109
131
+ .b8 120
132
+ .b8 109
133
+ .b8 50
134
+ .b8 111
135
+ .b8 98
136
+ .b8 117
137
+ .b8 99
138
+ .b8 113
139
+ .b8 102
140
+ .b8 102
141
+ .b8 50
142
+ .b8 122
143
+ .b8 52
144
+ .b8 118
145
+ .b8 99
146
+ .b8 53
147
+ .b8 53
148
+ .b8 122
149
+ .b8 99
150
+ .b8 110
151
+ .b8 115
152
+ .b8 99
153
+ .b8 102
154
+ .b8 117
155
+ .b8 118
156
+ .b8 117
157
+ .b8 114
158
+ .b8 53
159
+ .b8 115
160
+ .b8 50
161
+ .b8 98
162
+ .b8 51
163
+ .b8 101
164
+ .b8 51
165
+ .b8 54
166
+ .b8 100
167
+ .b8 118
168
+ .b8 103
169
+ .b8 109
170
+ .b8 53
171
+ .b8 55
172
+ .b8 113
173
+ .b8 111
174
+ .b8 98
175
+ .b8 97
176
+ .b8 110
177
+ .b8 108
178
+ .b8 112
179
+ .b8 104
180
+ .b8 111
181
+ .b8 46
182
+ .b8 112
183
+ .b8 121
184
+ .b8 0
185
+ .b32 .debug_line
186
+ .b8 47
187
+ .b8 116
188
+ .b8 109
189
+ .b8 112
190
+ .b8 47
191
+ .b8 116
192
+ .b8 111
193
+ .b8 114
194
+ .b8 99
195
+ .b8 104
196
+ .b8 105
197
+ .b8 110
198
+ .b8 100
199
+ .b8 117
200
+ .b8 99
201
+ .b8 116
202
+ .b8 111
203
+ .b8 114
204
+ .b8 95
205
+ .b8 114
206
+ .b8 111
207
+ .b8 111
208
+ .b8 116
209
+ .b8 47
210
+ .b8 109
211
+ .b8 120
212
+ .b8 0
213
+ .b8 1
214
+ .b64 $L__func_begin0
215
+ .b64 $L__func_end0
216
+ .b8 2
217
+ .b64 $L__func_begin0
218
+ .b64 $L__func_end0
219
+ .b8 1
220
+ .b8 156
221
+ .b8 116
222
+ .b8 114
223
+ .b8 105
224
+ .b8 116
225
+ .b8 111
226
+ .b8 110
227
+ .b8 95
228
+ .b8 95
229
+ .b8 48
230
+ .b8 100
231
+ .b8 49
232
+ .b8 100
233
+ .b8 50
234
+ .b8 100
235
+ .b8 101
236
+ .b8 0
237
+ .b8 116
238
+ .b8 114
239
+ .b8 105
240
+ .b8 116
241
+ .b8 111
242
+ .b8 110
243
+ .b8 95
244
+ .b8 95
245
+ .b8 48
246
+ .b8 100
247
+ .b8 49
248
+ .b8 100
249
+ .b8 50
250
+ .b8 100
251
+ .b8 101
252
+ .b8 0
253
+ .b8 1
254
+ .b8 18
255
+ .b8 1
256
+ .b8 0
257
+ }
258
+ .section .debug_pubnames
259
+ {
260
+ .b32 $L__pubNames_end0-$L__pubNames_start0
261
+ $L__pubNames_start0:
262
+ .b8 2
263
+ .b8 0
264
+ .b32 .debug_info
265
+ .b32 180
266
+ .b32 125
267
+ .b8 116
268
+ .b8 114
269
+ .b8 105
270
+ .b8 116
271
+ .b8 111
272
+ .b8 110
273
+ .b8 95
274
+ .b8 95
275
+ .b8 48
276
+ .b8 100
277
+ .b8 49
278
+ .b8 100
279
+ .b8 50
280
+ .b8 100
281
+ .b8 101
282
+ .b8 0
283
+ .b32 0
284
+ $L__pubNames_end0:
285
+ }
286
+ .section .debug_pubtypes
287
+ {
288
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
289
+ $L__pubTypes_start0:
290
+ .b8 2
291
+ .b8 0
292
+ .b32 .debug_info
293
+ .b32 180
294
+ .b32 0
295
+ $L__pubTypes_end0:
296
+ }
297
+ .section .debug_loc { }
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
5
+ %c512_i32 = arith.constant 512 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c512_i32 : i32
8
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
14
+ %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
15
+ %9 = arith.extf %8 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
16
+ %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
17
+ %11 = tt.addptr %10, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
18
+ tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
19
+ tt.return
20
+ }
21
+ }
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<12865792> : tensor<512xi32>
4
+ %c512_i32 = arith.constant 512 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c512_i32 : i32
7
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
9
+ %4 = arith.addi %3, %2 : tensor<512xi32>
10
+ %5 = arith.cmpi slt, %4, %cst : tensor<512xi32>
11
+ %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
12
+ %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
13
+ %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
14
+ %9 = arith.extf %8 : tensor<512xbf16> to tensor<512xf32>
15
+ %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
16
+ %11 = tt.addptr %10, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
17
+ tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
18
+ tt.return
19
+ }
20
+ }
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin ADDED
Binary file (36.4 kB). View file
 
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin ADDED
Binary file (4.65 kB). View file
 
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i64 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 2, !dbg !8
7
+ %5 = and i32 %4, 508, !dbg !8
8
+ %6 = or i32 %5, 512, !dbg !8
9
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
10
+ %8 = sext i32 %7 to i64, !dbg !10
11
+ %9 = shl nsw i64 %8, 10, !dbg !11
12
+ %10 = zext nneg i32 %5 to i64
13
+ %11 = zext nneg i32 %6 to i64
14
+ %12 = or i64 %9, %10, !dbg !12
15
+ %13 = or i64 %9, %11, !dbg !12
16
+ %14 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
17
+ %15 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !13
18
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %14, i1 true) #1, !dbg !14
19
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 true) #1, !dbg !14
20
+ ret void, !dbg !15
21
+ }
22
+
23
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
24
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
25
+
26
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
27
+ attributes #1 = { nounwind }
28
+
29
+ !llvm.module.flags = !{!0}
30
+ !llvm.dbg.cu = !{!1}
31
+ !nvvm.annotations = !{!3, !4, !4, !3}
32
+
33
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
34
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
35
+ !2 = !DIFile(filename: "cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py", directory: "/tmp/torchinductor_root/pk")
36
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
37
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
38
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
39
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
40
+ !7 = !{}
41
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
42
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
43
+ !10 = !DILocation(line: 20, column: 34, scope: !5)
44
+ !11 = !DILocation(line: 20, column: 46, scope: !5)
45
+ !12 = !DILocation(line: 21, column: 23, scope: !5)
46
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
47
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
48
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u64 triton__0d1de_param_1
14
+ )
15
+ .maxntid 128, 1, 1
16
+ {
17
+ .reg .pred %p<3>;
18
+ .reg .b32 %r<13>;
19
+ .reg .b64 %rd<8>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r10, %tid.x;
28
+ shl.b32 %r11, %r10, 2;
29
+ and.b32 %r12, %r11, 508;
30
+ .loc 1 20 28
31
+ mov.u32 %r1, %ctaid.x;
32
+ .loc 1 20 46
33
+ mul.wide.s32 %rd4, %r1, 1024;
34
+ cvt.u64.u32 %rd5, %r12;
35
+ .loc 1 21 23
36
+ or.b64 %rd6, %rd4, %rd5;
37
+ .loc 1 25 25
38
+ shl.b64 %rd7, %rd6, 2;
39
+ add.s64 %rd1, %rd3, %rd7;
40
+ add.s64 %rd2, %rd1, 2048;
41
+ mov.b32 %r2, 0;
42
+ mov.pred %p1, -1;
43
+ .loc 1 25 36
44
+ @%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
45
+ @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
46
+ .loc 1 25 4
47
+ ret;
48
+ $L__tmp1:
49
+ $L__func_end0:
50
+
51
+ }
52
+ .file 1 "/tmp/torchinductor_root/pk/cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py"
53
+ .section .debug_abbrev
54
+ {
55
+ .b8 1
56
+ .b8 17
57
+ .b8 1
58
+ .b8 37
59
+ .b8 8
60
+ .b8 19
61
+ .b8 5
62
+ .b8 3
63
+ .b8 8
64
+ .b8 16
65
+ .b8 6
66
+ .b8 27
67
+ .b8 8
68
+ .b8 180
69
+ .b8 66
70
+ .b8 12
71
+ .b8 17
72
+ .b8 1
73
+ .b8 18
74
+ .b8 1
75
+ .b8 0
76
+ .b8 0
77
+ .b8 2
78
+ .b8 46
79
+ .b8 0
80
+ .b8 17
81
+ .b8 1
82
+ .b8 18
83
+ .b8 1
84
+ .b8 64
85
+ .b8 10
86
+ .b8 135
87
+ .b8 64
88
+ .b8 8
89
+ .b8 3
90
+ .b8 8
91
+ .b8 58
92
+ .b8 11
93
+ .b8 59
94
+ .b8 11
95
+ .b8 63
96
+ .b8 12
97
+ .b8 0
98
+ .b8 0
99
+ .b8 0
100
+ }
101
+ .section .debug_info
102
+ {
103
+ .b32 172
104
+ .b8 2
105
+ .b8 0
106
+ .b32 .debug_abbrev
107
+ .b8 8
108
+ .b8 1
109
+ .b8 116
110
+ .b8 114
111
+ .b8 105
112
+ .b8 116
113
+ .b8 111
114
+ .b8 110
115
+ .b8 0
116
+ .b8 2
117
+ .b8 0
118
+ .b8 99
119
+ .b8 112
120
+ .b8 107
121
+ .b8 119
122
+ .b8 51
123
+ .b8 98
124
+ .b8 100
125
+ .b8 111
126
+ .b8 97
127
+ .b8 109
128
+ .b8 108
129
+ .b8 103
130
+ .b8 122
131
+ .b8 118
132
+ .b8 113
133
+ .b8 106
134
+ .b8 101
135
+ .b8 121
136
+ .b8 117
137
+ .b8 107
138
+ .b8 51
139
+ .b8 52
140
+ .b8 98
141
+ .b8 51
142
+ .b8 106
143
+ .b8 99
144
+ .b8 106
145
+ .b8 102
146
+ .b8 53
147
+ .b8 55
148
+ .b8 104
149
+ .b8 116
150
+ .b8 105
151
+ .b8 115
152
+ .b8 97
153
+ .b8 114
154
+ .b8 97
155
+ .b8 55
156
+ .b8 108
157
+ .b8 117
158
+ .b8 107
159
+ .b8 102
160
+ .b8 108
161
+ .b8 101
162
+ .b8 120
163
+ .b8 111
164
+ .b8 51
165
+ .b8 116
166
+ .b8 50
167
+ .b8 50
168
+ .b8 101
169
+ .b8 119
170
+ .b8 46
171
+ .b8 112
172
+ .b8 121
173
+ .b8 0
174
+ .b32 .debug_line
175
+ .b8 47
176
+ .b8 116
177
+ .b8 109
178
+ .b8 112
179
+ .b8 47
180
+ .b8 116
181
+ .b8 111
182
+ .b8 114
183
+ .b8 99
184
+ .b8 104
185
+ .b8 105
186
+ .b8 110
187
+ .b8 100
188
+ .b8 117
189
+ .b8 99
190
+ .b8 116
191
+ .b8 111
192
+ .b8 114
193
+ .b8 95
194
+ .b8 114
195
+ .b8 111
196
+ .b8 111
197
+ .b8 116
198
+ .b8 47
199
+ .b8 112
200
+ .b8 107
201
+ .b8 0
202
+ .b8 1
203
+ .b64 $L__func_begin0
204
+ .b64 $L__func_end0
205
+ .b8 2
206
+ .b64 $L__func_begin0
207
+ .b64 $L__func_end0
208
+ .b8 1
209
+ .b8 156
210
+ .b8 116
211
+ .b8 114
212
+ .b8 105
213
+ .b8 116
214
+ .b8 111
215
+ .b8 110
216
+ .b8 95
217
+ .b8 95
218
+ .b8 48
219
+ .b8 100
220
+ .b8 49
221
+ .b8 100
222
+ .b8 101
223
+ .b8 0
224
+ .b8 116
225
+ .b8 114
226
+ .b8 105
227
+ .b8 116
228
+ .b8 111
229
+ .b8 110
230
+ .b8 95
231
+ .b8 95
232
+ .b8 48
233
+ .b8 100
234
+ .b8 49
235
+ .b8 100
236
+ .b8 101
237
+ .b8 0
238
+ .b8 1
239
+ .b8 18
240
+ .b8 1
241
+ .b8 0
242
+ }
243
+ .section .debug_pubnames
244
+ {
245
+ .b32 $L__pubNames_end0-$L__pubNames_start0
246
+ $L__pubNames_start0:
247
+ .b8 2
248
+ .b8 0
249
+ .b32 .debug_info
250
+ .b32 176
251
+ .b32 125
252
+ .b8 116
253
+ .b8 114
254
+ .b8 105
255
+ .b8 116
256
+ .b8 111
257
+ .b8 110
258
+ .b8 95
259
+ .b8 95
260
+ .b8 48
261
+ .b8 100
262
+ .b8 49
263
+ .b8 100
264
+ .b8 101
265
+ .b8 0
266
+ .b32 0
267
+ $L__pubNames_end0:
268
+ }
269
+ .section .debug_pubtypes
270
+ {
271
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
272
+ $L__pubTypes_start0:
273
+ .b8 2
274
+ .b8 0
275
+ .b32 .debug_info
276
+ .b32 176
277
+ .b32 0
278
+ $L__pubTypes_end0:
279
+ }
280
+ .section .debug_loc { }
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
5
+ %c1024_i64 = arith.constant 1024 : i64
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.extsi %0 : i32 to i64
8
+ %2 = arith.muli %1, %c1024_i64 : i64
9
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
10
+ %4 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked>
11
+ %5 = tt.splat %2 : (i64) -> tensor<1024xi64, #blocked>
12
+ %6 = arith.addi %5, %4 : tensor<1024xi64, #blocked>
13
+ %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
14
+ %8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi64, #blocked>
15
+ tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c1024_i64 = arith.constant 1024 : i64
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.extsi %0 : i32 to i64
7
+ %2 = arith.muli %1, %c1024_i64 : i64
8
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
9
+ %4 = arith.extsi %3 : tensor<1024xi32> to tensor<1024xi64>
10
+ %5 = tt.splat %2 : (i64) -> tensor<1024xi64>
11
+ %6 = arith.addi %5, %4 : tensor<1024xi64>
12
+ %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
13
+ %8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi64>
14
+ tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %5 = and i32 %4, 127, !dbg !8
9
+ %6 = shl nuw nsw i32 %5, 3, !dbg !8
10
+ %7 = shl nuw nsw i32 %5, 2, !dbg !8
11
+ %8 = or i32 %7, 512, !dbg !8
12
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
13
+ %10 = shl i32 %9, 10, !dbg !10
14
+ %11 = or i32 %10, %6, !dbg !11
15
+ %12 = or i32 %10, %7, !dbg !11
16
+ %13 = or i32 %10, %8, !dbg !11
17
+ %14 = icmp slt i32 %11, 12865792, !dbg !12
18
+ %15 = icmp slt i32 %12, 12865792, !dbg !12
19
+ %16 = icmp slt i32 %13, 12865792, !dbg !12
20
+ %17 = sext i32 %11 to i64, !dbg !13
21
+ %18 = getelementptr i16, ptr addrspace(1) %0, i64 %17, !dbg !13
22
+ %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %18, i1 %14) #2, !dbg !14
23
+ %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !14
24
+ %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !14
25
+ %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !14
26
+ %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !14
27
+ %24 = trunc i32 %20 to i16, !dbg !14
28
+ %extelt.offset = lshr i32 %20, 16, !dbg !14
29
+ %25 = trunc i32 %extelt.offset to i16, !dbg !14
30
+ %26 = trunc i32 %21 to i16, !dbg !14
31
+ %extelt.offset1 = lshr i32 %21, 16, !dbg !14
32
+ %27 = trunc i32 %extelt.offset1 to i16, !dbg !14
33
+ %28 = trunc i32 %22 to i16, !dbg !14
34
+ %extelt.offset2 = lshr i32 %22, 16, !dbg !14
35
+ %29 = trunc i32 %extelt.offset2 to i16, !dbg !14
36
+ %30 = trunc i32 %23 to i16, !dbg !14
37
+ %extelt.offset3 = lshr i32 %23, 16, !dbg !14
38
+ %31 = trunc i32 %extelt.offset3 to i16, !dbg !14
39
+ %32 = zext nneg i32 %6 to i64, !dbg !15
40
+ %33 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %32, !dbg !15
41
+ %34 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !15
42
+ store <1 x i16> %34, ptr addrspace(3) %33, align 2, !dbg !15
43
+ %35 = or i32 %6, 1, !dbg !15
44
+ %36 = zext nneg i32 %35 to i64, !dbg !15
45
+ %37 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %36, !dbg !15
46
+ %38 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !15
47
+ store <1 x i16> %38, ptr addrspace(3) %37, align 2, !dbg !15
48
+ %39 = or i32 %6, 2, !dbg !15
49
+ %40 = zext nneg i32 %39 to i64, !dbg !15
50
+ %41 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %40, !dbg !15
51
+ %42 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !15
52
+ store <1 x i16> %42, ptr addrspace(3) %41, align 2, !dbg !15
53
+ %43 = or i32 %6, 3, !dbg !15
54
+ %44 = zext nneg i32 %43 to i64, !dbg !15
55
+ %45 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %44, !dbg !15
56
+ %46 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !15
57
+ store <1 x i16> %46, ptr addrspace(3) %45, align 2, !dbg !15
58
+ %47 = or i32 %6, 4, !dbg !15
59
+ %48 = zext nneg i32 %47 to i64, !dbg !15
60
+ %49 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %48, !dbg !15
61
+ %50 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !15
62
+ store <1 x i16> %50, ptr addrspace(3) %49, align 2, !dbg !15
63
+ %51 = or i32 %6, 5, !dbg !15
64
+ %52 = zext nneg i32 %51 to i64, !dbg !15
65
+ %53 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %52, !dbg !15
66
+ %54 = insertelement <1 x i16> undef, i16 %29, i64 0, !dbg !15
67
+ store <1 x i16> %54, ptr addrspace(3) %53, align 2, !dbg !15
68
+ %55 = or i32 %6, 6, !dbg !15
69
+ %56 = zext nneg i32 %55 to i64, !dbg !15
70
+ %57 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %56, !dbg !15
71
+ %58 = insertelement <1 x i16> undef, i16 %30, i64 0, !dbg !15
72
+ store <1 x i16> %58, ptr addrspace(3) %57, align 2, !dbg !15
73
+ %59 = or i32 %6, 7, !dbg !15
74
+ %60 = zext nneg i32 %59 to i64, !dbg !15
75
+ %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !15
76
+ %62 = insertelement <1 x i16> undef, i16 %31, i64 0, !dbg !15
77
+ store <1 x i16> %62, ptr addrspace(3) %61, align 2, !dbg !15
78
+ tail call void @llvm.nvvm.barrier0(), !dbg !15
79
+ %63 = zext nneg i32 %7 to i64, !dbg !15
80
+ %64 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %63, !dbg !15
81
+ %65 = load i16, ptr addrspace(3) %64, align 2, !dbg !15
82
+ %66 = or i32 %7, 1, !dbg !15
83
+ %67 = zext nneg i32 %66 to i64, !dbg !15
84
+ %68 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %67, !dbg !15
85
+ %69 = load i16, ptr addrspace(3) %68, align 2, !dbg !15
86
+ %70 = or i32 %7, 2, !dbg !15
87
+ %71 = zext nneg i32 %70 to i64, !dbg !15
88
+ %72 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %71, !dbg !15
89
+ %73 = load i16, ptr addrspace(3) %72, align 2, !dbg !15
90
+ %74 = or i32 %7, 3, !dbg !15
91
+ %75 = zext nneg i32 %74 to i64, !dbg !15
92
+ %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !15
93
+ %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !15
94
+ %78 = zext nneg i32 %8 to i64, !dbg !15
95
+ %79 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %78, !dbg !15
96
+ %80 = load i16, ptr addrspace(3) %79, align 2, !dbg !15
97
+ %81 = or i32 %7, 513, !dbg !15
98
+ %82 = zext nneg i32 %81 to i64, !dbg !15
99
+ %83 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %82, !dbg !15
100
+ %84 = load i16, ptr addrspace(3) %83, align 2, !dbg !15
101
+ %85 = or i32 %7, 514, !dbg !15
102
+ %86 = zext nneg i32 %85 to i64, !dbg !15
103
+ %87 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %86, !dbg !15
104
+ %88 = load i16, ptr addrspace(3) %87, align 2, !dbg !15
105
+ %89 = or i32 %7, 515, !dbg !15
106
+ %90 = zext nneg i32 %89 to i64, !dbg !15
107
+ %91 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %90, !dbg !15
108
+ %92 = load i16, ptr addrspace(3) %91, align 2, !dbg !15
109
+ %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !15
110
+ %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #2, !dbg !15
111
+ %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #2, !dbg !15
112
+ %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !15
113
+ %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %80) #2, !dbg !15
114
+ %98 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #2, !dbg !15
115
+ %99 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %88) #2, !dbg !15
116
+ %100 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %92) #2, !dbg !15
117
+ %101 = sext i32 %12 to i64, !dbg !16
118
+ %102 = getelementptr float, ptr addrspace(1) %1, i64 %101, !dbg !16
119
+ %103 = sext i32 %13 to i64, !dbg !16
120
+ %104 = getelementptr float, ptr addrspace(1) %1, i64 %103, !dbg !16
121
+ %105 = bitcast float %93 to i32, !dbg !17
122
+ %106 = bitcast float %94 to i32, !dbg !17
123
+ %107 = bitcast float %95 to i32, !dbg !17
124
+ %108 = bitcast float %96 to i32, !dbg !17
125
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %105, i32 %106, i32 %107, i32 %108, ptr addrspace(1) %102, i1 %15) #2, !dbg !17
126
+ %109 = bitcast float %97 to i32, !dbg !17
127
+ %110 = bitcast float %98 to i32, !dbg !17
128
+ %111 = bitcast float %99 to i32, !dbg !17
129
+ %112 = bitcast float %100 to i32, !dbg !17
130
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %109, i32 %110, i32 %111, i32 %112, ptr addrspace(1) %104, i1 %16) #2, !dbg !17
131
+ ret void, !dbg !18
132
+ }
133
+
134
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
135
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
136
+
137
+ ; Function Attrs: convergent nocallback nounwind
138
+ declare void @llvm.nvvm.barrier0() #1
139
+
140
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
141
+ attributes #1 = { convergent nocallback nounwind }
142
+ attributes #2 = { nounwind }
143
+
144
+ !llvm.module.flags = !{!0}
145
+ !llvm.dbg.cu = !{!1}
146
+ !nvvm.annotations = !{!3, !4, !4, !3}
147
+
148
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
149
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
150
+ !2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
151
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
152
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
153
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
154
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
155
+ !7 = !{}
156
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
157
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
158
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
159
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
160
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
161
+ !13 = !DILocation(line: 24, column: 30, scope: !5)
162
+ !14 = !DILocation(line: 24, column: 35, scope: !5)
163
+ !15 = !DILocation(line: 24, column: 45, scope: !5)
164
+ !16 = !DILocation(line: 26, column: 25, scope: !5)
165
+ !17 = !DILocation(line: 26, column: 36, scope: !5)
166
+ !18 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 128, 1, 1
18
+ {
19
+ .reg .pred %p<4>;
20
+ .reg .b16 %rs<9>;
21
+ .reg .b32 %r<38>;
22
+ .reg .b64 %rd<13>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
28
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
29
+ $L__tmp0:
30
+ .loc 1 21 36
31
+ mov.u32 %r22, %tid.x;
32
+ and.b32 %r23, %r22, 127;
33
+ shl.b32 %r24, %r23, 3;
34
+ shl.b32 %r25, %r23, 2;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r26, %r1, 10;
39
+ .loc 1 21 23
40
+ or.b32 %r27, %r26, %r24;
41
+ or.b32 %r28, %r26, %r25;
42
+ or.b32 %r29, %r28, 512;
43
+ .loc 1 22 21
44
+ setp.lt.s32 %p1, %r27, 12865792;
45
+ setp.lt.s32 %p2, %r28, 12865792;
46
+ setp.lt.s32 %p3, %r29, 12865792;
47
+ .loc 1 24 30
48
+ mul.wide.s32 %rd6, %r27, 2;
49
+ add.s64 %rd1, %rd4, %rd6;
50
+ .loc 1 24 35
51
+ mov.u32 %r2, 0x0;
52
+ mov.u32 %r3, 0x0;
53
+ mov.u32 %r4, 0x0;
54
+ mov.u32 %r5, 0x0;
55
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
56
+ shr.u32 %r30, %r2, 16;
57
+ shr.u32 %r31, %r3, 16;
58
+ shr.u32 %r32, %r4, 16;
59
+ shr.u32 %r33, %r5, 16;
60
+ .loc 1 24 45
61
+ shl.b32 %r34, %r23, 4;
62
+ mov.u32 %r35, global_smem;
63
+ add.s32 %r36, %r35, %r34;
64
+ st.shared.u16 [%r36], %r2;
65
+ st.shared.u16 [%r36+2], %r30;
66
+ st.shared.u16 [%r36+4], %r3;
67
+ st.shared.u16 [%r36+6], %r31;
68
+ st.shared.u16 [%r36+8], %r4;
69
+ st.shared.u16 [%r36+10], %r32;
70
+ st.shared.u16 [%r36+12], %r5;
71
+ st.shared.u16 [%r36+14], %r33;
72
+ bar.sync 0;
73
+ add.s32 %r37, %r35, %r24;
74
+ ld.shared.u16 %rs1, [%r37];
75
+ ld.shared.u16 %rs2, [%r37+2];
76
+ ld.shared.u16 %rs3, [%r37+4];
77
+ ld.shared.u16 %rs4, [%r37+6];
78
+ ld.shared.u16 %rs5, [%r37+1024];
79
+ ld.shared.u16 %rs6, [%r37+1026];
80
+ ld.shared.u16 %rs7, [%r37+1028];
81
+ ld.shared.u16 %rs8, [%r37+1030];
82
+ cvt.f32.bf16 %r14, %rs1;
83
+ cvt.f32.bf16 %r15, %rs2;
84
+ cvt.f32.bf16 %r16, %rs3;
85
+ cvt.f32.bf16 %r17, %rs4;
86
+ cvt.f32.bf16 %r18, %rs5;
87
+ cvt.f32.bf16 %r19, %rs6;
88
+ cvt.f32.bf16 %r20, %rs7;
89
+ cvt.f32.bf16 %r21, %rs8;
90
+ .loc 1 26 25
91
+ mul.wide.s32 %rd7, %r28, 4;
92
+ add.s64 %rd2, %rd5, %rd7;
93
+ cvt.s64.s32 %rd8, %r26;
94
+ cvt.u64.u32 %rd9, %r25;
95
+ or.b64 %rd10, %rd8, %rd9;
96
+ shl.b64 %rd11, %rd10, 2;
97
+ add.s64 %rd12, %rd5, %rd11;
98
+ add.s64 %rd3, %rd12, 2048;
99
+ .loc 1 26 36
100
+ @%p2 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
101
+ @%p3 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
102
+ .loc 1 26 4
103
+ ret;
104
+ $L__tmp1:
105
+ $L__func_end0:
106
+
107
+ }
108
+ .file 1 "/tmp/torchinductor_root/mx/cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py"
109
+ .section .debug_abbrev
110
+ {
111
+ .b8 1
112
+ .b8 17
113
+ .b8 1
114
+ .b8 37
115
+ .b8 8
116
+ .b8 19
117
+ .b8 5
118
+ .b8 3
119
+ .b8 8
120
+ .b8 16
121
+ .b8 6
122
+ .b8 27
123
+ .b8 8
124
+ .b8 180
125
+ .b8 66
126
+ .b8 12
127
+ .b8 17
128
+ .b8 1
129
+ .b8 18
130
+ .b8 1
131
+ .b8 0
132
+ .b8 0
133
+ .b8 2
134
+ .b8 46
135
+ .b8 0
136
+ .b8 17
137
+ .b8 1
138
+ .b8 18
139
+ .b8 1
140
+ .b8 64
141
+ .b8 10
142
+ .b8 135
143
+ .b8 64
144
+ .b8 8
145
+ .b8 3
146
+ .b8 8
147
+ .b8 58
148
+ .b8 11
149
+ .b8 59
150
+ .b8 11
151
+ .b8 63
152
+ .b8 12
153
+ .b8 0
154
+ .b8 0
155
+ .b8 0
156
+ }
157
+ .section .debug_info
158
+ {
159
+ .b32 176
160
+ .b8 2
161
+ .b8 0
162
+ .b32 .debug_abbrev
163
+ .b8 8
164
+ .b8 1
165
+ .b8 116
166
+ .b8 114
167
+ .b8 105
168
+ .b8 116
169
+ .b8 111
170
+ .b8 110
171
+ .b8 0
172
+ .b8 2
173
+ .b8 0
174
+ .b8 99
175
+ .b8 109
176
+ .b8 120
177
+ .b8 109
178
+ .b8 50
179
+ .b8 111
180
+ .b8 98
181
+ .b8 117
182
+ .b8 99
183
+ .b8 113
184
+ .b8 102
185
+ .b8 102
186
+ .b8 50
187
+ .b8 122
188
+ .b8 52
189
+ .b8 118
190
+ .b8 99
191
+ .b8 53
192
+ .b8 53
193
+ .b8 122
194
+ .b8 99
195
+ .b8 110
196
+ .b8 115
197
+ .b8 99
198
+ .b8 102
199
+ .b8 117
200
+ .b8 118
201
+ .b8 117
202
+ .b8 114
203
+ .b8 53
204
+ .b8 115
205
+ .b8 50
206
+ .b8 98
207
+ .b8 51
208
+ .b8 101
209
+ .b8 51
210
+ .b8 54
211
+ .b8 100
212
+ .b8 118
213
+ .b8 103
214
+ .b8 109
215
+ .b8 53
216
+ .b8 55
217
+ .b8 113
218
+ .b8 111
219
+ .b8 98
220
+ .b8 97
221
+ .b8 110
222
+ .b8 108
223
+ .b8 112
224
+ .b8 104
225
+ .b8 111
226
+ .b8 46
227
+ .b8 112
228
+ .b8 121
229
+ .b8 0
230
+ .b32 .debug_line
231
+ .b8 47
232
+ .b8 116
233
+ .b8 109
234
+ .b8 112
235
+ .b8 47
236
+ .b8 116
237
+ .b8 111
238
+ .b8 114
239
+ .b8 99
240
+ .b8 104
241
+ .b8 105
242
+ .b8 110
243
+ .b8 100
244
+ .b8 117
245
+ .b8 99
246
+ .b8 116
247
+ .b8 111
248
+ .b8 114
249
+ .b8 95
250
+ .b8 114
251
+ .b8 111
252
+ .b8 111
253
+ .b8 116
254
+ .b8 47
255
+ .b8 109
256
+ .b8 120
257
+ .b8 0
258
+ .b8 1
259
+ .b64 $L__func_begin0
260
+ .b64 $L__func_end0
261
+ .b8 2
262
+ .b64 $L__func_begin0
263
+ .b64 $L__func_end0
264
+ .b8 1
265
+ .b8 156
266
+ .b8 116
267
+ .b8 114
268
+ .b8 105
269
+ .b8 116
270
+ .b8 111
271
+ .b8 110
272
+ .b8 95
273
+ .b8 95
274
+ .b8 48
275
+ .b8 100
276
+ .b8 49
277
+ .b8 100
278
+ .b8 50
279
+ .b8 100
280
+ .b8 101
281
+ .b8 0
282
+ .b8 116
283
+ .b8 114
284
+ .b8 105
285
+ .b8 116
286
+ .b8 111
287
+ .b8 110
288
+ .b8 95
289
+ .b8 95
290
+ .b8 48
291
+ .b8 100
292
+ .b8 49
293
+ .b8 100
294
+ .b8 50
295
+ .b8 100
296
+ .b8 101
297
+ .b8 0
298
+ .b8 1
299
+ .b8 18
300
+ .b8 1
301
+ .b8 0
302
+ }
303
+ .section .debug_pubnames
304
+ {
305
+ .b32 $L__pubNames_end0-$L__pubNames_start0
306
+ $L__pubNames_start0:
307
+ .b8 2
308
+ .b8 0
309
+ .b32 .debug_info
310
+ .b32 180
311
+ .b32 125
312
+ .b8 116
313
+ .b8 114
314
+ .b8 105
315
+ .b8 116
316
+ .b8 111
317
+ .b8 110
318
+ .b8 95
319
+ .b8 95
320
+ .b8 48
321
+ .b8 100
322
+ .b8 49
323
+ .b8 100
324
+ .b8 50
325
+ .b8 100
326
+ .b8 101
327
+ .b8 0
328
+ .b32 0
329
+ $L__pubNames_end0:
330
+ }
331
+ .section .debug_pubtypes
332
+ {
333
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
334
+ $L__pubTypes_start0:
335
+ .b8 2
336
+ .b8 0
337
+ .b32 .debug_info
338
+ .b32 180
339
+ .b32 0
340
+ $L__pubTypes_end0:
341
+ }
342
+ .section .debug_loc { }
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
6
+ %cst_0 = arith.constant dense<12865792> : tensor<1024xi32, #blocked1>
7
+ %c1024_i32 = arith.constant 1024 : i32
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.muli %0, %c1024_i32 : i32
10
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
11
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
12
+ %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
13
+ %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
14
+ %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
15
+ %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
16
+ %8 = arith.cmpi slt, %6, %cst : tensor<1024xi32, #blocked>
17
+ %9 = arith.cmpi slt, %7, %cst_0 : tensor<1024xi32, #blocked1>
18
+ %10 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
19
+ %11 = tt.addptr %10, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
20
+ %12 = tt.load %11, %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
21
+ %13 = triton_gpu.convert_layout %12 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
22
+ %14 = arith.extf %13 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
23
+ %15 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
24
+ %16 = tt.addptr %15, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
25
+ tt.store %16, %14, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
26
+ tt.return
27
+ }
28
+ }
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<12865792> : tensor<1024xi32>
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
10
+ %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32>
11
+ %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
12
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
13
+ %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
14
+ %9 = arith.extf %8 : tensor<1024xbf16> to tensor<1024xf32>
15
+ %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
16
+ %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
17
+ tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
18
+ tt.return
19
+ }
20
+ }
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin ADDED
Binary file (26 kB). View file
 
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = and i32 %8, 31, !dbg !10
18
+ %10 = lshr i32 %8, 5, !dbg !10
19
+ %11 = and i32 %10, 3, !dbg !10
20
+ %12 = lshr i32 %9, 1, !dbg !10
21
+ %13 = shl nuw nsw i32 %11, 4, !dbg !10
22
+ %14 = or i32 %13, %12, !dbg !10
23
+ %15 = and i32 %8, 63, !dbg !10
24
+ %16 = shl i32 %8, 2, !dbg !11
25
+ %17 = and i32 %16, 4, !dbg !11
26
+ %18 = and i32 %8, 7, !dbg !11
27
+ %19 = shl nuw nsw i32 %11, 2, !dbg !12
28
+ %20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
29
+ %21 = shl i32 %20, 6, !dbg !14
30
+ %22 = or i32 %21, %14, !dbg !15
31
+ %23 = or i32 %21, %15, !dbg !15
32
+ %24 = sext i32 %22 to i64, !dbg !16
33
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
34
+ %26 = sext i32 %23 to i64, !dbg !16
35
+ %27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
40
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
41
+ %33 = srem i32 %22, 512, !dbg !18
42
+ %34 = shl nsw i32 %33, 8, !dbg !19
43
+ %35 = add i64 %32, 50257, !dbg !20
44
+ %36 = icmp slt i64 %28, 0, !dbg !21
45
+ %37 = icmp slt i64 %32, 0, !dbg !21
46
+ %38 = select i1 %37, i64 %35, i64 %32, !dbg !22
47
+ %39 = icmp ugt i64 %38, 50256, !dbg !23
48
+ %40 = shl i64 %28, 8, !dbg !24
49
+ %41 = add i64 %40, 12865792, !dbg !24
50
+ %42 = select i1 %36, i64 %41, i64 %40, !dbg !24
51
+ %43 = getelementptr float, ptr addrspace(1) %1, i64 %42
52
+ br label %44, !dbg !12
53
+
54
+ 44: ; preds = %7, %76
55
+ %45 = phi float [ 0.000000e+00, %7 ], [ %96, %76 ]
56
+ %46 = phi float [ 0.000000e+00, %7 ], [ %97, %76 ]
57
+ %47 = phi float [ 0.000000e+00, %7 ], [ %98, %76 ]
58
+ %48 = phi float [ 0.000000e+00, %7 ], [ %99, %76 ]
59
+ %49 = phi float [ 0.000000e+00, %7 ], [ %100, %76 ]
60
+ %50 = phi float [ 0.000000e+00, %7 ], [ %101, %76 ]
61
+ %51 = phi float [ 0.000000e+00, %7 ], [ %102, %76 ]
62
+ %52 = phi float [ 0.000000e+00, %7 ], [ %103, %76 ]
63
+ %53 = phi float [ 0.000000e+00, %7 ], [ %120, %76 ]
64
+ %54 = phi float [ 0.000000e+00, %7 ], [ %121, %76 ]
65
+ %55 = phi float [ 0.000000e+00, %7 ], [ %122, %76 ]
66
+ %56 = phi float [ 0.000000e+00, %7 ], [ %123, %76 ]
67
+ %57 = phi float [ 0.000000e+00, %7 ], [ %108, %76 ]
68
+ %58 = phi float [ 0.000000e+00, %7 ], [ %109, %76 ]
69
+ %59 = phi float [ 0.000000e+00, %7 ], [ %110, %76 ]
70
+ %60 = phi float [ 0.000000e+00, %7 ], [ %111, %76 ]
71
+ %61 = phi i32 [ 0, %7 ], [ %124, %76 ]
72
+ %62 = or i32 %61, %17, !dbg !25
73
+ %63 = add i32 %62, %34, !dbg !26
74
+ %64 = sext i32 %63 to i64, !dbg !27
75
+ %65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27
76
+ %66 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
77
+ %67 = extractvalue { i32, i32, i32, i32 } %66, 0, !dbg !28
78
+ %68 = extractvalue { i32, i32, i32, i32 } %66, 1, !dbg !28
79
+ %69 = extractvalue { i32, i32, i32, i32 } %66, 2, !dbg !28
80
+ %70 = extractvalue { i32, i32, i32, i32 } %66, 3, !dbg !28
81
+ %71 = bitcast i32 %67 to float, !dbg !28
82
+ %72 = bitcast i32 %68 to float, !dbg !28
83
+ %73 = bitcast i32 %69 to float, !dbg !28
84
+ %74 = bitcast i32 %70 to float, !dbg !28
85
+ br i1 %39, label %75, label %76, !dbg !29
86
+
87
+ 75: ; preds = %44
88
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29
89
+ br label %76, !dbg !29
90
+
91
+ 76: ; preds = %75, %44
92
+ %77 = zext nneg i32 %62 to i64, !dbg !30
93
+ %78 = getelementptr float, ptr addrspace(1) %43, i64 %77, !dbg !31
94
+ %79 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %78, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
95
+ %80 = extractvalue { i32, i32, i32, i32 } %79, 0, !dbg !32
96
+ %81 = extractvalue { i32, i32, i32, i32 } %79, 1, !dbg !32
97
+ %82 = extractvalue { i32, i32, i32, i32 } %79, 2, !dbg !32
98
+ %83 = extractvalue { i32, i32, i32, i32 } %79, 3, !dbg !32
99
+ %84 = bitcast i32 %80 to float, !dbg !32
100
+ %85 = bitcast i32 %81 to float, !dbg !32
101
+ %86 = bitcast i32 %82 to float, !dbg !32
102
+ %87 = bitcast i32 %83 to float, !dbg !32
103
+ %88 = fadd float %71, %84, !dbg !33
104
+ %89 = fadd float %72, %85, !dbg !33
105
+ %90 = fadd float %73, %86, !dbg !33
106
+ %91 = fadd float %74, %87, !dbg !33
107
+ %92 = fsub float %88, %57, !dbg !34
108
+ %93 = fsub float %89, %58, !dbg !34
109
+ %94 = fsub float %90, %59, !dbg !34
110
+ %95 = fsub float %91, %60, !dbg !34
111
+ %96 = fadd float %45, 1.000000e+00, !dbg !38
112
+ %97 = fadd float %46, 1.000000e+00, !dbg !38
113
+ %98 = fadd float %47, 1.000000e+00, !dbg !38
114
+ %99 = fadd float %48, 1.000000e+00, !dbg !38
115
+ %100 = fadd float %49, 1.000000e+00, !dbg !38
116
+ %101 = fadd float %50, 1.000000e+00, !dbg !38
117
+ %102 = fadd float %51, 1.000000e+00, !dbg !38
118
+ %103 = fadd float %52, 1.000000e+00, !dbg !38
119
+ %104 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %92, float %96) #6, !dbg !39
120
+ %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %93, float %97) #6, !dbg !39
121
+ %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %94, float %98) #6, !dbg !39
122
+ %107 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %95, float %99) #6, !dbg !39
123
+ %108 = fadd float %57, %104, !dbg !40
124
+ %109 = fadd float %58, %105, !dbg !40
125
+ %110 = fadd float %59, %106, !dbg !40
126
+ %111 = fadd float %60, %107, !dbg !40
127
+ %112 = fsub float %88, %108, !dbg !41
128
+ %113 = fsub float %89, %109, !dbg !41
129
+ %114 = fsub float %90, %110, !dbg !41
130
+ %115 = fsub float %91, %111, !dbg !41
131
+ %116 = fmul float %92, %112, !dbg !42
132
+ %117 = fmul float %93, %113, !dbg !42
133
+ %118 = fmul float %94, %114, !dbg !42
134
+ %119 = fmul float %95, %115, !dbg !42
135
+ %120 = fadd float %53, %116, !dbg !43
136
+ %121 = fadd float %54, %117, !dbg !43
137
+ %122 = fadd float %55, %118, !dbg !43
138
+ %123 = fadd float %56, %119, !dbg !43
139
+ %124 = add nuw nsw i32 %61, 8, !dbg !12
140
+ %125 = icmp ult i32 %61, 248, !dbg !12
141
+ br i1 %125, label %44, label %126, !dbg !12
142
+
143
+ 126: ; preds = %76
144
+ %127 = lshr i32 %9, 3, !dbg !12
145
+ %128 = or i32 %19, %127, !dbg !12
146
+ %129 = mul nuw nsw i32 %128, 12, !dbg !12
147
+ %130 = add nuw nsw i32 %129, %18, !dbg !12
148
+ %131 = zext nneg i32 %130 to i64, !dbg !12
149
+ %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !12
150
+ %133 = insertelement <1 x float> undef, float %100, i64 0, !dbg !12
151
+ store <1 x float> %133, ptr addrspace(3) %132, align 4, !dbg !12
152
+ %134 = or i32 %18, 192, !dbg !12
153
+ %135 = add nuw nsw i32 %134, %129, !dbg !12
154
+ %136 = zext nneg i32 %135 to i64, !dbg !12
155
+ %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !12
156
+ %138 = insertelement <1 x float> undef, float %101, i64 0, !dbg !12
157
+ store <1 x float> %138, ptr addrspace(3) %137, align 4, !dbg !12
158
+ %139 = or i32 %18, 384, !dbg !12
159
+ %140 = add nuw nsw i32 %139, %129, !dbg !12
160
+ %141 = zext nneg i32 %140 to i64, !dbg !12
161
+ %142 = getelementptr float, ptr addrspace(3) @global_smem, i64 %141, !dbg !12
162
+ %143 = insertelement <1 x float> undef, float %102, i64 0, !dbg !12
163
+ store <1 x float> %143, ptr addrspace(3) %142, align 4, !dbg !12
164
+ %144 = or i32 %18, 576, !dbg !12
165
+ %145 = add nuw nsw i32 %144, %129, !dbg !12
166
+ %146 = zext nneg i32 %145 to i64, !dbg !12
167
+ %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !12
168
+ %148 = insertelement <1 x float> undef, float %103, i64 0, !dbg !12
169
+ store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !12
170
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
171
+ %149 = mul nuw nsw i32 %14, 12, !dbg !12
172
+ %150 = add nuw nsw i32 %149, %17, !dbg !12
173
+ %151 = zext nneg i32 %150 to i64, !dbg !12
174
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
175
+ %153 = load float, ptr addrspace(3) %152, align 16, !dbg !12
176
+ %154 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 1, !dbg !12
177
+ %155 = load float, ptr addrspace(3) %154, align 4, !dbg !12
178
+ %156 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 2, !dbg !12
179
+ %157 = load float, ptr addrspace(3) %156, align 8, !dbg !12
180
+ %158 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 3, !dbg !12
181
+ %159 = load float, ptr addrspace(3) %158, align 4, !dbg !12
182
+ %160 = fsub float %109, %108, !dbg !44
183
+ %161 = fadd float %153, %155, !dbg !48
184
+ %162 = fcmp oeq float %161, 0.000000e+00, !dbg !49
185
+ %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %155, float %161) #6, !dbg !50
186
+ %164 = select i1 %162, float 0.000000e+00, float %163, !dbg !51
187
+ %165 = fmul float %160, %164, !dbg !52
188
+ %166 = fadd float %108, %165, !dbg !53
189
+ %167 = fadd float %120, %121, !dbg !54
190
+ %168 = fmul float %160, %160, !dbg !55
191
+ %169 = fmul float %168, %153, !dbg !56
192
+ %170 = fmul float %169, %164, !dbg !57
193
+ %171 = fadd float %167, %170, !dbg !58
194
+ %172 = fsub float %110, %166, !dbg !44
195
+ %173 = fadd float %157, %161, !dbg !48
196
+ %174 = fcmp oeq float %173, 0.000000e+00, !dbg !49
197
+ %175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %157, float %173) #6, !dbg !50
198
+ %176 = select i1 %174, float 0.000000e+00, float %175, !dbg !51
199
+ %177 = fmul float %176, %172, !dbg !52
200
+ %178 = fadd float %166, %177, !dbg !53
201
+ %179 = fadd float %122, %171, !dbg !54
202
+ %180 = fmul float %172, %172, !dbg !55
203
+ %181 = fmul float %161, %180, !dbg !56
204
+ %182 = fmul float %176, %181, !dbg !57
205
+ %183 = fadd float %179, %182, !dbg !58
206
+ %184 = fsub float %111, %178, !dbg !44
207
+ %185 = fadd float %159, %173, !dbg !48
208
+ %186 = fcmp oeq float %185, 0.000000e+00, !dbg !49
209
+ %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %159, float %185) #6, !dbg !50
210
+ %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !51
211
+ %189 = fmul float %188, %184, !dbg !52
212
+ %190 = fadd float %178, %189, !dbg !53
213
+ %191 = fadd float %123, %183, !dbg !54
214
+ %192 = fmul float %184, %184, !dbg !55
215
+ %193 = fmul float %173, %192, !dbg !56
216
+ %194 = fmul float %188, %193, !dbg !57
217
+ %195 = fadd float %191, %194, !dbg !58
218
+ %196 = bitcast float %190 to i32, !dbg !59
219
+ %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !59
220
+ %198 = bitcast i32 %197 to float, !dbg !59
221
+ %199 = bitcast float %195 to i32, !dbg !59
222
+ %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !59
223
+ %201 = bitcast i32 %200 to float, !dbg !59
224
+ %202 = bitcast float %185 to i32, !dbg !59
225
+ %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !59
226
+ %204 = bitcast i32 %203 to float, !dbg !59
227
+ %205 = fsub float %198, %190, !dbg !44
228
+ %206 = fadd float %185, %204, !dbg !48
229
+ %207 = fcmp oeq float %206, 0.000000e+00, !dbg !49
230
+ %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !50
231
+ %209 = select i1 %207, float 0.000000e+00, float %208, !dbg !51
232
+ %210 = fmul float %209, %205, !dbg !52
233
+ %211 = fadd float %190, %210, !dbg !53
234
+ %212 = fadd float %195, %201, !dbg !54
235
+ %213 = fmul float %205, %205, !dbg !55
236
+ %214 = fmul float %185, %213, !dbg !56
237
+ %215 = fmul float %209, %214, !dbg !57
238
+ %216 = fadd float %212, %215, !dbg !58
239
+ %217 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
240
+ %218 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
241
+ %219 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
242
+ %220 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
243
+ %221 = fadd float %217, 0x3EE4F8B580000000, !dbg !62
244
+ %222 = shl i32 %22, 8, !dbg !63
245
+ br label %223, !dbg !64
246
+
247
+ 223: ; preds = %126, %__nv_rsqrtf.exit
248
+ %224 = phi i32 [ 0, %126 ], [ %298, %__nv_rsqrtf.exit ]
249
+ %225 = or i32 %224, %17, !dbg !65
250
+ %226 = add i32 %225, %34, !dbg !66
251
+ %227 = sext i32 %226 to i64, !dbg !67
252
+ %228 = getelementptr float, ptr addrspace(1) %2, i64 %227, !dbg !67
253
+ %229 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %228, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
254
+ %230 = extractvalue { i32, i32, i32, i32 } %229, 0, !dbg !68
255
+ %231 = extractvalue { i32, i32, i32, i32 } %229, 1, !dbg !68
256
+ %232 = extractvalue { i32, i32, i32, i32 } %229, 2, !dbg !68
257
+ %233 = extractvalue { i32, i32, i32, i32 } %229, 3, !dbg !68
258
+ %234 = bitcast i32 %230 to float, !dbg !68
259
+ %235 = bitcast i32 %231 to float, !dbg !68
260
+ %236 = bitcast i32 %232 to float, !dbg !68
261
+ %237 = bitcast i32 %233 to float, !dbg !68
262
+ %238 = zext nneg i32 %225 to i64, !dbg !69
263
+ %239 = getelementptr float, ptr addrspace(1) %3, i64 %238, !dbg !69
264
+ %240 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %239, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
265
+ %241 = extractvalue { i32, i32, i32, i32 } %240, 0, !dbg !70
266
+ %242 = extractvalue { i32, i32, i32, i32 } %240, 1, !dbg !70
267
+ %243 = extractvalue { i32, i32, i32, i32 } %240, 2, !dbg !70
268
+ %244 = extractvalue { i32, i32, i32, i32 } %240, 3, !dbg !70
269
+ %245 = bitcast i32 %241 to float, !dbg !70
270
+ %246 = bitcast i32 %242 to float, !dbg !70
271
+ %247 = bitcast i32 %243 to float, !dbg !70
272
+ %248 = bitcast i32 %244 to float, !dbg !70
273
+ br i1 %39, label %249, label %250, !dbg !71
274
+
275
+ 249: ; preds = %223
276
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
277
+ br label %250, !dbg !71
278
+
279
+ 250: ; preds = %249, %223
280
+ %251 = getelementptr float, ptr addrspace(1) %43, i64 %238, !dbg !72
281
+ %252 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %251, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
282
+ %253 = extractvalue { i32, i32, i32, i32 } %252, 0, !dbg !73
283
+ %254 = extractvalue { i32, i32, i32, i32 } %252, 1, !dbg !73
284
+ %255 = extractvalue { i32, i32, i32, i32 } %252, 2, !dbg !73
285
+ %256 = extractvalue { i32, i32, i32, i32 } %252, 3, !dbg !73
286
+ %257 = bitcast i32 %253 to float, !dbg !73
287
+ %258 = bitcast i32 %254 to float, !dbg !73
288
+ %259 = bitcast i32 %255 to float, !dbg !73
289
+ %260 = bitcast i32 %256 to float, !dbg !73
290
+ %261 = fadd float %234, %257, !dbg !74
291
+ %262 = fadd float %235, %258, !dbg !74
292
+ %263 = fadd float %236, %259, !dbg !74
293
+ %264 = fadd float %237, %260, !dbg !74
294
+ %265 = fsub float %261, %211, !dbg !75
295
+ %266 = fsub float %262, %211, !dbg !75
296
+ %267 = fsub float %263, %211, !dbg !75
297
+ %268 = fsub float %264, %211, !dbg !75
298
+ %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
299
+ %.not.i = icmp eq i32 %269, 0, !dbg !76
300
+ br i1 %.not.i, label %272, label %270, !dbg !76
301
+
302
+ 270: ; preds = %250
303
+ %271 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %221), !dbg !76
304
+ br label %__nv_rsqrtf.exit, !dbg !76
305
+
306
+ 272: ; preds = %250
307
+ %273 = tail call float @llvm.nvvm.rsqrt.approx.f(float %221), !dbg !76
308
+ br label %__nv_rsqrtf.exit, !dbg !76
309
+
310
+ __nv_rsqrtf.exit: ; preds = %270, %272
311
+ %.0.i = phi float [ %271, %270 ], [ %273, %272 ], !dbg !76
312
+ %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
313
+ %275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
314
+ %276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
315
+ %277 = fmul float %265, %.0.i, !dbg !77
316
+ %278 = fmul float %266, %.0.i, !dbg !77
317
+ %279 = fmul float %267, %.0.i, !dbg !77
318
+ %280 = fmul float %268, %.0.i, !dbg !77
319
+ %281 = fmul float %277, %245, !dbg !78
320
+ %282 = fmul float %278, %246, !dbg !78
321
+ %283 = fmul float %279, %247, !dbg !78
322
+ %284 = fmul float %280, %248, !dbg !78
323
+ %285 = add i32 %225, %222, !dbg !79
324
+ %286 = sext i32 %285 to i64, !dbg !80
325
+ %287 = getelementptr i16, ptr addrspace(1) %4, i64 %286, !dbg !80
326
+ %288 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %281) #6, !dbg !81
327
+ %289 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %282) #6, !dbg !81
328
+ %290 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %283) #6, !dbg !81
329
+ %291 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %284) #6, !dbg !81
330
+ %292 = insertelement <2 x i16> undef, i16 %288, i64 0, !dbg !81
331
+ %293 = insertelement <2 x i16> %292, i16 %289, i64 1, !dbg !81
332
+ %294 = bitcast <2 x i16> %293 to i32, !dbg !81
333
+ %295 = insertelement <2 x i16> undef, i16 %290, i64 0, !dbg !81
334
+ %296 = insertelement <2 x i16> %295, i16 %291, i64 1, !dbg !81
335
+ %297 = bitcast <2 x i16> %296 to i32, !dbg !81
336
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %294, i32 %297, ptr addrspace(1) %287, i1 true) #6, !dbg !81
337
+ %298 = add nuw nsw i32 %224, 8, !dbg !64
338
+ %299 = icmp ult i32 %224, 248, !dbg !64
339
+ br i1 %299, label %223, label %300, !dbg !64
340
+
341
+ 300: ; preds = %__nv_rsqrtf.exit
342
+ ret void, !dbg !82
343
+ }
344
+
345
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
346
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
347
+
348
+ ; Function Attrs: convergent nocallback nounwind
349
+ declare void @llvm.nvvm.barrier0() #1
350
+
351
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
352
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
353
+
354
+ ; Function Attrs: alwaysinline nounwind
355
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
356
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
357
+ %.not = icmp eq i32 %1, 0
358
+ br i1 %.not, label %4, label %2
359
+
360
+ 2: ; preds = %0
361
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
362
+ br label %6
363
+
364
+ 4: ; preds = %0
365
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
366
+ br label %6
367
+
368
+ 6: ; preds = %4, %2
369
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
370
+ ret float %.0
371
+ }
372
+
373
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
374
+
375
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
376
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
377
+
378
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
379
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
380
+
381
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
382
+ attributes #1 = { convergent nocallback nounwind }
383
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
384
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
385
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
386
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
387
+ attributes #6 = { nounwind }
388
+
389
+ !llvm.module.flags = !{!0, !1}
390
+ !llvm.dbg.cu = !{!2}
391
+ !nvvm.annotations = !{!4, !5, !5, !4}
392
+ !llvm.ident = !{!6}
393
+
394
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
395
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
396
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
397
+ !3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
398
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
399
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
400
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
401
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
402
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
403
+ !9 = !{}
404
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
405
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
406
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
407
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
408
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
409
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
410
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
411
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
412
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
413
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
414
+ !20 = !DILocation(line: 36, column: 22, scope: !7)
415
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
416
+ !22 = !DILocation(line: 38, column: 36, scope: !7)
417
+ !23 = !DILocation(line: 39, column: 40, scope: !7)
418
+ !24 = !DILocation(line: 40, column: 44, scope: !7)
419
+ !25 = !DILocation(line: 32, column: 27, scope: !7)
420
+ !26 = !DILocation(line: 35, column: 40, scope: !7)
421
+ !27 = !DILocation(line: 35, column: 34, scope: !7)
422
+ !28 = !DILocation(line: 35, column: 50, scope: !7)
423
+ !29 = !DILocation(line: 39, column: 55, scope: !7)
424
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
425
+ !31 = !DILocation(line: 40, column: 34, scope: !7)
426
+ !32 = !DILocation(line: 40, column: 52, scope: !7)
427
+ !33 = !DILocation(line: 41, column: 22, scope: !7)
428
+ !34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
429
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
430
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
431
+ !37 = !DILocation(line: 44, column: 38, scope: !35)
432
+ !38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
433
+ !39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
434
+ !40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
435
+ !41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
436
+ !42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
437
+ !43 = !DILocation(line: 47, column: 48, scope: !7)
438
+ !44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
439
+ !45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
440
+ !46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
441
+ !47 = !DILocation(line: 50, column: 41, scope: !45)
442
+ !48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
443
+ !49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
444
+ !50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
445
+ !51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
446
+ !52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
447
+ !53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
448
+ !54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
449
+ !55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
450
+ !56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
451
+ !57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
452
+ !58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
453
+ !59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
454
+ !60 = !DILocation(line: 50, column: 41, scope: !35)
455
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
456
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
457
+ !63 = !DILocation(line: 76, column: 39, scope: !7)
458
+ !64 = !DILocation(line: 55, column: 36, scope: !7)
459
+ !65 = !DILocation(line: 56, column: 27, scope: !7)
460
+ !66 = !DILocation(line: 59, column: 41, scope: !7)
461
+ !67 = !DILocation(line: 59, column: 35, scope: !7)
462
+ !68 = !DILocation(line: 59, column: 51, scope: !7)
463
+ !69 = !DILocation(line: 60, column: 35, scope: !7)
464
+ !70 = !DILocation(line: 60, column: 40, scope: !7)
465
+ !71 = !DILocation(line: 64, column: 57, scope: !7)
466
+ !72 = !DILocation(line: 65, column: 35, scope: !7)
467
+ !73 = !DILocation(line: 65, column: 54, scope: !7)
468
+ !74 = !DILocation(line: 66, column: 24, scope: !7)
469
+ !75 = !DILocation(line: 67, column: 24, scope: !7)
470
+ !76 = !DILocation(line: 72, column: 30, scope: !7)
471
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
472
+ !78 = !DILocation(line: 74, column: 24, scope: !7)
473
+ !79 = !DILocation(line: 76, column: 35, scope: !7)
474
+ !80 = !DILocation(line: 76, column: 29, scope: !7)
475
+ !81 = !DILocation(line: 76, column: 52, scope: !7)
476
+ !82 = !DILocation(line: 55, column: 4, scope: !7)