; ModuleID = 'LLVMDialectModule' source_filename = "LLVMDialectModule" @global_smem = external addrspace(3) global [0 x i8] define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 { %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 %11 = lshr i32 %10, 5, !dbg !8 %urem = and i32 %10, 255, !dbg !9 %12 = or i32 %urem, 256, !dbg !9 %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 %14 = sext i32 %13 to i64, !dbg !11 %15 = shl nsw i64 %14, 3, !dbg !12 %16 = or i64 %15, 1, !dbg !13 %17 = or i64 %15, 2, !dbg !13 %18 = or i64 %15, 3, !dbg !13 %19 = or i64 %15, 4, !dbg !13 %20 = or i64 %15, 5, !dbg !13 %21 = or i64 %15, 6, !dbg !13 %22 = or i64 %15, 7, !dbg !13 %23 = insertelement <2 x i32> poison, i32 %urem, i64 0 %24 = insertelement <2 x i32> %23, i32 %12, i64 1 %25 = zext nneg <2 x i32> %24 to <2 x i64> %26 = getelementptr i64, ptr addrspace(1) %1, i64 %15, !dbg !14 %27 = getelementptr i64, ptr addrspace(1) %1, i64 %16, !dbg !14 %28 = getelementptr i64, ptr addrspace(1) %1, i64 %17, !dbg !14 %29 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14 %30 = getelementptr i64, ptr addrspace(1) %1, i64 %19, !dbg !14 %31 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !14 %32 = getelementptr i64, ptr addrspace(1) %1, i64 %21, !dbg !14 %33 = getelementptr i64, ptr addrspace(1) %1, i64 %22, !dbg !14 %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #3, !dbg !15 %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #3, !dbg !15 %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) 
%28, i1 true) #3, !dbg !15 %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %29, i1 true) #3, !dbg !15 %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !15 %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %31, i1 true) #3, !dbg !15 %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %32, i1 true) #3, !dbg !15 %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %33, i1 true) #3, !dbg !15 %42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16 %43 = bitcast i32 %42 to float, !dbg !16 %44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17 %45 = bitcast i32 %44 to float, !dbg !17 %46 = mul nsw i64 %14, 402056, !dbg !18 %47 = mul nsw i64 %16, 50257, !dbg !18 %48 = mul nsw i64 %17, 50257, !dbg !18 %49 = mul nsw i64 %18, 50257, !dbg !18 %50 = mul nsw i64 %19, 50257, !dbg !18 %51 = mul nsw i64 %20, 50257, !dbg !18 %52 = mul nsw i64 %21, 50257, !dbg !18 %53 = mul nsw i64 %22, 50257, !dbg !18 %54 = insertelement <8 x i64> poison, i64 %34, i64 0, !dbg !19 %55 = insertelement <8 x i64> %54, i64 %35, i64 1, !dbg !19 %56 = insertelement <8 x i64> %55, i64 %36, i64 2, !dbg !19 %57 = insertelement <8 x i64> %56, i64 %37, i64 3, !dbg !19 %58 = insertelement <8 x i64> %57, i64 %38, i64 4, !dbg !19 %59 = insertelement <8 x i64> %58, i64 %39, i64 5, !dbg !19 %60 = insertelement <8 x i64> %59, i64 %40, i64 6, !dbg !19 %61 = insertelement <8 x 
i64> %60, i64 %41, i64 7, !dbg !19 %62 = icmp eq <8 x i64> %61, , !dbg !19 %63 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %43, float %45) #3, !dbg !20 %64 = insertelement <8 x float> poison, float %63, i64 0, !dbg !21 %65 = shufflevector <8 x float> %64, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !21 %66 = select <8 x i1> %62, <8 x float> zeroinitializer, <8 x float> %65, !dbg !21 %67 = shufflevector <8 x float> %66, <8 x float> poison, <16 x i32> , !dbg !21 br label %68, !dbg !22 68: ; preds = %9, %68 %69 = phi i32 [ 0, %9 ], [ %135, %68 ] %70 = phi <16 x float> [ zeroinitializer, %9 ], [ %134, %68 ] %71 = zext nneg i32 %69 to i64, !dbg !23 %72 = insertelement <2 x i64> poison, i64 %71, i64 0, !dbg !23 %73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23 %74 = or <2 x i64> %73, %25, !dbg !23 %75 = icmp ult <2 x i64> %74, , !dbg !24 %76 = shufflevector <2 x i1> %75, <2 x i1> poison, <16 x i32> , !dbg !24 %77 = extractelement <2 x i64> %74, i64 0, !dbg !25 %78 = getelementptr float, ptr addrspace(1) %0, i64 %77, !dbg !25 %79 = getelementptr float, ptr addrspace(1) %78, i64 %46, !dbg !25 %80 = extractelement <2 x i64> %74, i64 1, !dbg !25 %81 = getelementptr float, ptr addrspace(1) %0, i64 %80, !dbg !25 %82 = getelementptr float, ptr addrspace(1) %81, i64 %46, !dbg !25 %83 = getelementptr float, ptr addrspace(1) %78, i64 %47, !dbg !25 %84 = getelementptr float, ptr addrspace(1) %81, i64 %47, !dbg !25 %85 = getelementptr float, ptr addrspace(1) %78, i64 %48, !dbg !25 %86 = getelementptr float, ptr addrspace(1) %81, i64 %48, !dbg !25 %87 = getelementptr float, ptr addrspace(1) %78, i64 %49, !dbg !25 %88 = getelementptr float, ptr addrspace(1) %81, i64 %49, !dbg !25 %89 = getelementptr float, ptr addrspace(1) %78, i64 %50, !dbg !25 %90 = getelementptr float, ptr addrspace(1) %81, i64 %50, !dbg !25 %91 = getelementptr float, ptr addrspace(1) %78, i64 %51, !dbg !25 %92 = getelementptr float, ptr 
addrspace(1) %81, i64 %51, !dbg !25 %93 = getelementptr float, ptr addrspace(1) %78, i64 %52, !dbg !25 %94 = getelementptr float, ptr addrspace(1) %81, i64 %52, !dbg !25 %95 = getelementptr float, ptr addrspace(1) %78, i64 %53, !dbg !25 %96 = getelementptr float, ptr addrspace(1) %81, i64 %53, !dbg !25 %97 = extractelement <2 x i1> %75, i64 0, !dbg !26 %98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %79, i1 %97, i32 0, i1 %97) #3, !dbg !26 %99 = extractelement <2 x i1> %75, i64 1, !dbg !26 %100 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 %99, i32 0, i1 %99) #3, !dbg !26 %101 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %83, i1 %97, i32 0, i1 %97) #3, !dbg !26 %102 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %84, i1 %99, i32 0, i1 %99) #3, !dbg !26 %103 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %85, i1 %97, i32 0, i1 %97) #3, !dbg !26 %104 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %86, i1 %99, i32 0, i1 %99) #3, !dbg !26 %105 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %87, i1 %97, i32 0, i1 %97) #3, !dbg !26 %106 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 
];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %88, i1 %99, i32 0, i1 %99) #3, !dbg !26 %107 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %89, i1 %97, i32 0, i1 %97) #3, !dbg !26 %108 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %90, i1 %99, i32 0, i1 %99) #3, !dbg !26 %109 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 %97, i32 0, i1 %97) #3, !dbg !26 %110 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %92, i1 %99, i32 0, i1 %99) #3, !dbg !26 %111 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %93, i1 %97, i32 0, i1 %97) #3, !dbg !26 %112 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %94, i1 %99, i32 0, i1 %99) #3, !dbg !26 %113 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %95, i1 %97, i32 0, i1 %97) #3, !dbg !26 %114 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %96, i1 %99, i32 0, i1 %99) #3, !dbg !26 %115 = insertelement <16 x i32> poison, i32 %98, i64 0, !dbg !26 %116 = insertelement <16 x i32> %115, i32 %100, i64 1, !dbg !26 %117 = insertelement <16 x i32> %116, i32 %101, i64 2, !dbg !26 %118 = 
insertelement <16 x i32> %117, i32 %102, i64 3, !dbg !26 %119 = insertelement <16 x i32> %118, i32 %103, i64 4, !dbg !26 %120 = insertelement <16 x i32> %119, i32 %104, i64 5, !dbg !26 %121 = insertelement <16 x i32> %120, i32 %105, i64 6, !dbg !26 %122 = insertelement <16 x i32> %121, i32 %106, i64 7, !dbg !26 %123 = insertelement <16 x i32> %122, i32 %107, i64 8, !dbg !26 %124 = insertelement <16 x i32> %123, i32 %108, i64 9, !dbg !26 %125 = insertelement <16 x i32> %124, i32 %109, i64 10, !dbg !26 %126 = insertelement <16 x i32> %125, i32 %110, i64 11, !dbg !26 %127 = insertelement <16 x i32> %126, i32 %111, i64 12, !dbg !26 %128 = insertelement <16 x i32> %127, i32 %112, i64 13, !dbg !26 %129 = insertelement <16 x i32> %128, i32 %113, i64 14, !dbg !26 %130 = insertelement <16 x i32> %129, i32 %114, i64 15, !dbg !26 %131 = bitcast <16 x i32> %130 to <16 x float>, !dbg !26 %132 = fmul <16 x float> %67, %131, !dbg !27 %133 = select <16 x i1> %76, <16 x float> %132, <16 x float> , !dbg !28 %134 = fadd <16 x float> %70, %133, !dbg !28 %135 = add nuw nsw i32 %69, 512, !dbg !22 %136 = icmp ult i32 %69, 49745, !dbg !22 br i1 %136, label %68, label %137, !dbg !22 137: ; preds = %68 %138 = and i32 %10, 31, !dbg !8 %139 = and i32 %11, 7, !dbg !9 %shift = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %140 = fadd <16 x float> %134, %shift, !dbg !29 %141 = extractelement <16 x float> %140, i64 0, !dbg !29 %shift54 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %142 = fadd <16 x float> %134, %shift54, !dbg !29 %143 = extractelement <16 x float> %142, i64 2, !dbg !29 %shift55 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %144 = fadd <16 x float> %134, %shift55, !dbg !29 %145 = extractelement <16 x float> %144, i64 4, !dbg !29 %shift56 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %146 = fadd <16 x float> %134, %shift56, !dbg !29 %147 = extractelement <16 
x float> %146, i64 6, !dbg !29 %shift57 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %148 = fadd <16 x float> %134, %shift57, !dbg !29 %149 = extractelement <16 x float> %148, i64 8, !dbg !29 %shift58 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %150 = fadd <16 x float> %134, %shift58, !dbg !29 %151 = extractelement <16 x float> %150, i64 10, !dbg !29 %shift59 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %152 = fadd <16 x float> %134, %shift59, !dbg !29 %153 = extractelement <16 x float> %152, i64 12, !dbg !29 %shift60 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> , !dbg !29 %154 = fadd <16 x float> %134, %shift60, !dbg !29 %155 = extractelement <16 x float> %154, i64 14, !dbg !29 %156 = bitcast float %141 to i32, !dbg !35 %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 16, i32 31), !dbg !35 %158 = bitcast i32 %157 to float, !dbg !35 %159 = fadd float %141, %158, !dbg !29 %160 = bitcast float %159 to i32, !dbg !35 %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 8, i32 31), !dbg !35 %162 = bitcast i32 %161 to float, !dbg !35 %163 = fadd float %159, %162, !dbg !29 %164 = bitcast float %163 to i32, !dbg !35 %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 4, i32 31), !dbg !35 %166 = bitcast i32 %165 to float, !dbg !35 %167 = fadd float %163, %166, !dbg !29 %168 = bitcast float %167 to i32, !dbg !35 %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 2, i32 31), !dbg !35 %170 = bitcast i32 %169 to float, !dbg !35 %171 = fadd float %167, %170, !dbg !29 %172 = bitcast float %171 to i32, !dbg !35 %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 1, i32 31), !dbg !35 %174 = bitcast i32 %173 to float, !dbg !35 %175 = fadd float %171, %174, !dbg !29 %176 = bitcast float %143 to i32, !dbg !35 %177 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 16, i32 31), !dbg !35 %178 = bitcast i32 %177 to float, !dbg !35 %179 = fadd float %143, %178, !dbg !29 %180 = bitcast float %179 to i32, !dbg !35 %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !35 %182 = bitcast i32 %181 to float, !dbg !35 %183 = fadd float %179, %182, !dbg !29 %184 = bitcast float %183 to i32, !dbg !35 %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 4, i32 31), !dbg !35 %186 = bitcast i32 %185 to float, !dbg !35 %187 = fadd float %183, %186, !dbg !29 %188 = bitcast float %187 to i32, !dbg !35 %189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 2, i32 31), !dbg !35 %190 = bitcast i32 %189 to float, !dbg !35 %191 = fadd float %187, %190, !dbg !29 %192 = bitcast float %191 to i32, !dbg !35 %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !35 %194 = bitcast i32 %193 to float, !dbg !35 %195 = fadd float %191, %194, !dbg !29 %196 = bitcast float %145 to i32, !dbg !35 %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !35 %198 = bitcast i32 %197 to float, !dbg !35 %199 = fadd float %145, %198, !dbg !29 %200 = bitcast float %199 to i32, !dbg !35 %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !35 %202 = bitcast i32 %201 to float, !dbg !35 %203 = fadd float %199, %202, !dbg !29 %204 = bitcast float %203 to i32, !dbg !35 %205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 4, i32 31), !dbg !35 %206 = bitcast i32 %205 to float, !dbg !35 %207 = fadd float %203, %206, !dbg !29 %208 = bitcast float %207 to i32, !dbg !35 %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 2, i32 31), !dbg !35 %210 = bitcast i32 %209 to float, !dbg !35 %211 = fadd float %207, %210, !dbg !29 %212 = bitcast float %211 to i32, !dbg !35 %213 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 1, i32 31), !dbg !35 %214 = bitcast i32 %213 to float, !dbg !35 %215 = fadd float %211, %214, !dbg !29 %216 = bitcast float %147 to i32, !dbg !35 %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 16, i32 31), !dbg !35 %218 = bitcast i32 %217 to float, !dbg !35 %219 = fadd float %147, %218, !dbg !29 %220 = bitcast float %219 to i32, !dbg !35 %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 8, i32 31), !dbg !35 %222 = bitcast i32 %221 to float, !dbg !35 %223 = fadd float %219, %222, !dbg !29 %224 = bitcast float %223 to i32, !dbg !35 %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !35 %226 = bitcast i32 %225 to float, !dbg !35 %227 = fadd float %223, %226, !dbg !29 %228 = bitcast float %227 to i32, !dbg !35 %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !35 %230 = bitcast i32 %229 to float, !dbg !35 %231 = fadd float %227, %230, !dbg !29 %232 = bitcast float %231 to i32, !dbg !35 %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !35 %234 = bitcast i32 %233 to float, !dbg !35 %235 = fadd float %231, %234, !dbg !29 %236 = bitcast float %149 to i32, !dbg !35 %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 16, i32 31), !dbg !35 %238 = bitcast i32 %237 to float, !dbg !35 %239 = fadd float %149, %238, !dbg !29 %240 = bitcast float %239 to i32, !dbg !35 %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !35 %242 = bitcast i32 %241 to float, !dbg !35 %243 = fadd float %239, %242, !dbg !29 %244 = bitcast float %243 to i32, !dbg !35 %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 4, i32 31), !dbg !35 %246 = bitcast i32 %245 to float, !dbg !35 %247 = fadd float %243, %246, !dbg !29 %248 = bitcast float %247 to i32, !dbg !35 %249 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !35 %250 = bitcast i32 %249 to float, !dbg !35 %251 = fadd float %247, %250, !dbg !29 %252 = bitcast float %251 to i32, !dbg !35 %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !35 %254 = bitcast i32 %253 to float, !dbg !35 %255 = fadd float %251, %254, !dbg !29 %256 = bitcast float %151 to i32, !dbg !35 %257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 16, i32 31), !dbg !35 %258 = bitcast i32 %257 to float, !dbg !35 %259 = fadd float %151, %258, !dbg !29 %260 = bitcast float %259 to i32, !dbg !35 %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 8, i32 31), !dbg !35 %262 = bitcast i32 %261 to float, !dbg !35 %263 = fadd float %259, %262, !dbg !29 %264 = bitcast float %263 to i32, !dbg !35 %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 4, i32 31), !dbg !35 %266 = bitcast i32 %265 to float, !dbg !35 %267 = fadd float %263, %266, !dbg !29 %268 = bitcast float %267 to i32, !dbg !35 %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 2, i32 31), !dbg !35 %270 = bitcast i32 %269 to float, !dbg !35 %271 = fadd float %267, %270, !dbg !29 %272 = bitcast float %271 to i32, !dbg !35 %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !35 %274 = bitcast i32 %273 to float, !dbg !35 %275 = fadd float %271, %274, !dbg !29 %276 = bitcast float %153 to i32, !dbg !35 %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 16, i32 31), !dbg !35 %278 = bitcast i32 %277 to float, !dbg !35 %279 = fadd float %153, %278, !dbg !29 %280 = bitcast float %279 to i32, !dbg !35 %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 8, i32 31), !dbg !35 %282 = bitcast i32 %281 to float, !dbg !35 %283 = fadd float %279, %282, !dbg !29 %284 = bitcast float %283 to i32, !dbg !35 %285 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 4, i32 31), !dbg !35 %286 = bitcast i32 %285 to float, !dbg !35 %287 = fadd float %283, %286, !dbg !29 %288 = bitcast float %287 to i32, !dbg !35 %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !35 %290 = bitcast i32 %289 to float, !dbg !35 %291 = fadd float %287, %290, !dbg !29 %292 = bitcast float %291 to i32, !dbg !35 %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 1, i32 31), !dbg !35 %294 = bitcast i32 %293 to float, !dbg !35 %295 = fadd float %291, %294, !dbg !29 %296 = bitcast float %155 to i32, !dbg !35 %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 16, i32 31), !dbg !35 %298 = bitcast i32 %297 to float, !dbg !35 %299 = fadd float %155, %298, !dbg !29 %300 = bitcast float %299 to i32, !dbg !35 %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %300, i32 8, i32 31), !dbg !35 %302 = bitcast i32 %301 to float, !dbg !35 %303 = fadd float %299, %302, !dbg !29 %304 = bitcast float %303 to i32, !dbg !35 %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 4, i32 31), !dbg !35 %306 = bitcast i32 %305 to float, !dbg !35 %307 = fadd float %303, %306, !dbg !29 %308 = bitcast float %307 to i32, !dbg !35 %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !35 %310 = bitcast i32 %309 to float, !dbg !35 %311 = fadd float %307, %310, !dbg !29 %312 = bitcast float %311 to i32, !dbg !35 %313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !35 %314 = bitcast i32 %313 to float, !dbg !35 %315 = fadd float %311, %314, !dbg !29 %316 = icmp eq i32 %138, 0, !dbg !35 %317 = zext nneg i32 %139 to i64, !dbg !35 %318 = getelementptr float, ptr addrspace(3) @global_smem, i64 %317, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %318, float %175, i1 %316) #3, !dbg !35 %319 = or i32 %139, 
8, !dbg !35 %320 = zext nneg i32 %319 to i64, !dbg !35 %321 = getelementptr float, ptr addrspace(3) @global_smem, i64 %320, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %321, float %195, i1 %316) #3, !dbg !35 %322 = or i32 %139, 16, !dbg !35 %323 = zext nneg i32 %322 to i64, !dbg !35 %324 = getelementptr float, ptr addrspace(3) @global_smem, i64 %323, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %324, float %215, i1 %316) #3, !dbg !35 %325 = or i32 %139, 24, !dbg !35 %326 = zext nneg i32 %325 to i64, !dbg !35 %327 = getelementptr float, ptr addrspace(3) @global_smem, i64 %326, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %327, float %235, i1 %316) #3, !dbg !35 %328 = or i32 %139, 32, !dbg !35 %329 = zext nneg i32 %328 to i64, !dbg !35 %330 = getelementptr float, ptr addrspace(3) @global_smem, i64 %329, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %330, float %255, i1 %316) #3, !dbg !35 %331 = or i32 %139, 40, !dbg !35 %332 = zext nneg i32 %331 to i64, !dbg !35 %333 = getelementptr float, ptr addrspace(3) @global_smem, i64 %332, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %333, float %275, i1 %316) #3, !dbg !35 %334 = or i32 %139, 48, !dbg !35 %335 = zext nneg i32 %334 to i64, !dbg !35 %336 = getelementptr float, ptr addrspace(3) @global_smem, i64 %335, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %336, float %295, i1 %316) #3, !dbg !35 %337 = or i32 %139, 56, !dbg !35 %338 = zext nneg i32 %337 to i64, !dbg !35 %339 = getelementptr float, ptr addrspace(3) @global_smem, i64 %338, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %315, i1 %316) #3, !dbg !35 tail call void 
@llvm.nvvm.barrier0(), !dbg !35 %340 = icmp slt i32 %10, 64, !dbg !35 %341 = sext i32 %10 to i64, !dbg !35 %342 = getelementptr float, ptr addrspace(3) @global_smem, i64 %341, !dbg !35 %343 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !35 %344 = bitcast float %343 to i32, !dbg !35 %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 4, i32 31), !dbg !35 %346 = bitcast i32 %345 to float, !dbg !35 %347 = fadd float %343, %346, !dbg !29 %348 = bitcast float %347 to i32, !dbg !35 %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !35 %350 = bitcast i32 %349 to float, !dbg !35 %351 = fadd float %347, %350, !dbg !29 %352 = bitcast float %351 to i32, !dbg !35 %353 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %352, i32 1, i32 31), !dbg !35 %354 = bitcast i32 %353 to float, !dbg !35 %355 = fadd float %351, %354, !dbg !29 %356 = and i32 %10, 7, !dbg !35 %357 = icmp eq i32 %356, 0, !dbg !35 %358 = and i1 %340, %357, !dbg !35 tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %355, i1 %358) #3, !dbg !35 tail call void @llvm.nvvm.barrier0(), !dbg !35 %359 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35 %360 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !35 %361 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 64), align 4, !dbg !35 %362 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 96), align 4, !dbg !35 %363 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 128), align 4, !dbg !35 %364 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 160), align 4, !dbg !35 %365 = load float, ptr addrspace(3) 
getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 192), align 4, !dbg !35 %366 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 224), align 4, !dbg !35 %367 = extractelement <2 x i64> %25, i64 0, !dbg !37 %368 = extractelement <2 x i64> %25, i64 1, !dbg !37 %369 = extractelement <8 x float> %66, i64 0, !dbg !38 %370 = extractelement <8 x float> %66, i64 1, !dbg !38 %371 = extractelement <8 x float> %66, i64 2, !dbg !38 %372 = extractelement <8 x float> %66, i64 3, !dbg !38 %373 = extractelement <8 x float> %66, i64 4, !dbg !38 %374 = extractelement <8 x float> %66, i64 5, !dbg !38 %375 = extractelement <8 x float> %66, i64 6, !dbg !38 %376 = extractelement <8 x float> %66, i64 7, !dbg !38 br label %377, !dbg !39 377: ; preds = %137, %377 %378 = phi i32 [ 0, %137 ], [ %672, %377 ] %379 = zext nneg i32 %378 to i64, !dbg !37 %380 = or i64 %367, %379, !dbg !37 %381 = or i64 %368, %379, !dbg !37 %382 = icmp ult i64 %380, 50257, !dbg !40 %383 = icmp ult i64 %381, 50257, !dbg !40 %384 = add nsw i64 %380, %46, !dbg !41 %385 = add nsw i64 %381, %46, !dbg !41 %386 = add nsw i64 %380, %47, !dbg !41 %387 = add nsw i64 %381, %47, !dbg !41 %388 = add nsw i64 %380, %48, !dbg !41 %389 = add nsw i64 %381, %48, !dbg !41 %390 = add nsw i64 %380, %49, !dbg !41 %391 = add nsw i64 %381, %49, !dbg !41 %392 = add nsw i64 %380, %50, !dbg !41 %393 = add nsw i64 %381, %50, !dbg !41 %394 = add nsw i64 %380, %51, !dbg !41 %395 = add nsw i64 %381, %51, !dbg !41 %396 = add nsw i64 %380, %52, !dbg !41 %397 = add nsw i64 %381, %52, !dbg !41 %398 = add nsw i64 %380, %53, !dbg !41 %399 = add nsw i64 %381, %53, !dbg !41 %400 = getelementptr i16, ptr addrspace(1) %4, i64 %384, !dbg !42 %401 = getelementptr i16, ptr addrspace(1) %4, i64 %385, !dbg !42 %402 = getelementptr i16, ptr addrspace(1) %4, i64 %386, !dbg !42 %403 = getelementptr i16, ptr addrspace(1) %4, i64 %387, !dbg !42 %404 = getelementptr i16, ptr addrspace(1) %4, i64 %388, 
!dbg !42 %405 = getelementptr i16, ptr addrspace(1) %4, i64 %389, !dbg !42 %406 = getelementptr i16, ptr addrspace(1) %4, i64 %390, !dbg !42 %407 = getelementptr i16, ptr addrspace(1) %4, i64 %391, !dbg !42 %408 = getelementptr i16, ptr addrspace(1) %4, i64 %392, !dbg !42 %409 = getelementptr i16, ptr addrspace(1) %4, i64 %393, !dbg !42 %410 = getelementptr i16, ptr addrspace(1) %4, i64 %394, !dbg !42 %411 = getelementptr i16, ptr addrspace(1) %4, i64 %395, !dbg !42 %412 = getelementptr i16, ptr addrspace(1) %4, i64 %396, !dbg !42 %413 = getelementptr i16, ptr addrspace(1) %4, i64 %397, !dbg !42 %414 = getelementptr i16, ptr addrspace(1) %4, i64 %398, !dbg !42 %415 = getelementptr i16, ptr addrspace(1) %4, i64 %399, !dbg !42 %416 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %400, i1 %382, i16 0, i1 %382) #3, !dbg !43 %417 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %401, i1 %383, i16 0, i1 %383) #3, !dbg !43 %418 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %402, i1 %382, i16 0, i1 %382) #3, !dbg !43 %419 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %403, i1 %383, i16 0, i1 %383) #3, !dbg !43 %420 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %404, i1 %382, i16 0, i1 %382) #3, !dbg !43 %421 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %405, i1 %383, i16 
0, i1 %383) #3, !dbg !43 %422 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %406, i1 %382, i16 0, i1 %382) #3, !dbg !43 %423 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %407, i1 %383, i16 0, i1 %383) #3, !dbg !43 %424 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %408, i1 %382, i16 0, i1 %382) #3, !dbg !43 %425 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %409, i1 %383, i16 0, i1 %383) #3, !dbg !43 %426 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %410, i1 %382, i16 0, i1 %382) #3, !dbg !43 %427 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %411, i1 %383, i16 0, i1 %383) #3, !dbg !43 %428 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %412, i1 %382, i16 0, i1 %382) #3, !dbg !43 %429 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %413, i1 %383, i16 0, i1 %383) #3, !dbg !43 %430 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %414, i1 %382, i16 0, i1 %382) #3, !dbg !43 %431 = tail call i16 asm sideeffect 
"mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %415, i1 %383, i16 0, i1 %383) #3, !dbg !43 %432 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %416) #3, !dbg !44 %433 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %417) #3, !dbg !44 %434 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %418) #3, !dbg !44 %435 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %419) #3, !dbg !44 %436 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %420) #3, !dbg !44 %437 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %421) #3, !dbg !44 %438 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %422) #3, !dbg !44 %439 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %423) #3, !dbg !44 %440 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %424) #3, !dbg !44 %441 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %425) #3, !dbg !44 %442 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %426) #3, !dbg !44 %443 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %427) #3, !dbg !44 %444 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %428) #3, !dbg !44 %445 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %429) #3, !dbg !44 %446 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %430) #3, !dbg !44 %447 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %431) #3, !dbg !44 %448 = getelementptr float, ptr addrspace(1) %0, i64 %384, !dbg !45 %449 = getelementptr float, ptr addrspace(1) %0, i64 %385, !dbg !45 %450 = getelementptr float, ptr addrspace(1) %0, i64 %386, !dbg !45 %451 = getelementptr float, ptr addrspace(1) %0, i64 %387, !dbg !45 %452 = getelementptr float, ptr addrspace(1) %0, i64 %388, !dbg !45 %453 = getelementptr float, ptr addrspace(1) %0, i64 %389, !dbg !45 %454 = getelementptr float, ptr addrspace(1) %0, i64 %390, !dbg !45 %455 = getelementptr float, ptr addrspace(1) %0, 
i64 %391, !dbg !45 %456 = getelementptr float, ptr addrspace(1) %0, i64 %392, !dbg !45 %457 = getelementptr float, ptr addrspace(1) %0, i64 %393, !dbg !45 %458 = getelementptr float, ptr addrspace(1) %0, i64 %394, !dbg !45 %459 = getelementptr float, ptr addrspace(1) %0, i64 %395, !dbg !45 %460 = getelementptr float, ptr addrspace(1) %0, i64 %396, !dbg !45 %461 = getelementptr float, ptr addrspace(1) %0, i64 %397, !dbg !45 %462 = getelementptr float, ptr addrspace(1) %0, i64 %398, !dbg !45 %463 = getelementptr float, ptr addrspace(1) %0, i64 %399, !dbg !45 %464 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %448, i1 %382, i32 0, i1 %382) #3, !dbg !46 %465 = bitcast i32 %464 to float, !dbg !46 %466 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %449, i1 %383, i32 0, i1 %383) #3, !dbg !46 %467 = bitcast i32 %466 to float, !dbg !46 %468 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %450, i1 %382, i32 0, i1 %382) #3, !dbg !46 %469 = bitcast i32 %468 to float, !dbg !46 %470 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %451, i1 %383, i32 0, i1 %383) #3, !dbg !46 %471 = bitcast i32 %470 to float, !dbg !46 %472 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %452, i1 %382, i32 0, i1 %382) #3, !dbg !46 %473 = bitcast i32 %472 to float, !dbg !46 %474 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", 
"=r,l,b,r,b"(ptr addrspace(1) %453, i1 %383, i32 0, i1 %383) #3, !dbg !46 %475 = bitcast i32 %474 to float, !dbg !46 %476 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %454, i1 %382, i32 0, i1 %382) #3, !dbg !46 %477 = bitcast i32 %476 to float, !dbg !46 %478 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %455, i1 %383, i32 0, i1 %383) #3, !dbg !46 %479 = bitcast i32 %478 to float, !dbg !46 %480 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %456, i1 %382, i32 0, i1 %382) #3, !dbg !46 %481 = bitcast i32 %480 to float, !dbg !46 %482 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %457, i1 %383, i32 0, i1 %383) #3, !dbg !46 %483 = bitcast i32 %482 to float, !dbg !46 %484 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %458, i1 %382, i32 0, i1 %382) #3, !dbg !46 %485 = bitcast i32 %484 to float, !dbg !46 %486 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %459, i1 %383, i32 0, i1 %383) #3, !dbg !46 %487 = bitcast i32 %486 to float, !dbg !46 %488 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %460, i1 %382, i32 0, i1 %382) #3, !dbg !46 %489 = bitcast i32 %488 to float, !dbg !46 %490 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 
ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %461, i1 %383, i32 0, i1 %383) #3, !dbg !46 %491 = bitcast i32 %490 to float, !dbg !46 %492 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %462, i1 %382, i32 0, i1 %382) #3, !dbg !46 %493 = bitcast i32 %492 to float, !dbg !46 %494 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %463, i1 %383, i32 0, i1 %383) #3, !dbg !46 %495 = bitcast i32 %494 to float, !dbg !46 %496 = getelementptr i16, ptr addrspace(1) %5, i64 %384, !dbg !47 %497 = getelementptr i16, ptr addrspace(1) %5, i64 %385, !dbg !47 %498 = getelementptr i16, ptr addrspace(1) %5, i64 %386, !dbg !47 %499 = getelementptr i16, ptr addrspace(1) %5, i64 %387, !dbg !47 %500 = getelementptr i16, ptr addrspace(1) %5, i64 %388, !dbg !47 %501 = getelementptr i16, ptr addrspace(1) %5, i64 %389, !dbg !47 %502 = getelementptr i16, ptr addrspace(1) %5, i64 %390, !dbg !47 %503 = getelementptr i16, ptr addrspace(1) %5, i64 %391, !dbg !47 %504 = getelementptr i16, ptr addrspace(1) %5, i64 %392, !dbg !47 %505 = getelementptr i16, ptr addrspace(1) %5, i64 %393, !dbg !47 %506 = getelementptr i16, ptr addrspace(1) %5, i64 %394, !dbg !47 %507 = getelementptr i16, ptr addrspace(1) %5, i64 %395, !dbg !47 %508 = getelementptr i16, ptr addrspace(1) %5, i64 %396, !dbg !47 %509 = getelementptr i16, ptr addrspace(1) %5, i64 %397, !dbg !47 %510 = getelementptr i16, ptr addrspace(1) %5, i64 %398, !dbg !47 %511 = getelementptr i16, ptr addrspace(1) %5, i64 %399, !dbg !47 %512 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %496, i1 %382, i16 0, i1 %382) #3, !dbg !48 %513 = tail call 
i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %497, i1 %383, i16 0, i1 %383) #3, !dbg !48 %514 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %498, i1 %382, i16 0, i1 %382) #3, !dbg !48 %515 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %499, i1 %383, i16 0, i1 %383) #3, !dbg !48 %516 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %500, i1 %382, i16 0, i1 %382) #3, !dbg !48 %517 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %501, i1 %383, i16 0, i1 %383) #3, !dbg !48 %518 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %502, i1 %382, i16 0, i1 %382) #3, !dbg !48 %519 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %503, i1 %383, i16 0, i1 %383) #3, !dbg !48 %520 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %504, i1 %382, i16 0, i1 %382) #3, !dbg !48 %521 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %505, i1 %383, i16 0, i1 %383) #3, !dbg !48 %522 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 
ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %506, i1 %382, i16 0, i1 %382) #3, !dbg !48 %523 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %507, i1 %383, i16 0, i1 %383) #3, !dbg !48 %524 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %508, i1 %382, i16 0, i1 %382) #3, !dbg !48 %525 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %509, i1 %383, i16 0, i1 %383) #3, !dbg !48 %526 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %510, i1 %382, i16 0, i1 %382) #3, !dbg !48 %527 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %511, i1 %383, i16 0, i1 %383) #3, !dbg !48 %528 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #3, !dbg !49 %529 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #3, !dbg !49 %530 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #3, !dbg !49 %531 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #3, !dbg !49 %532 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #3, !dbg !49 %533 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %517) #3, !dbg !49 %534 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %518) #3, !dbg !49 %535 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %519) #3, !dbg !49 %536 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %520) #3, !dbg !49 %537 = tail call float asm "cvt.f32.bf16 
$0, $1;", "=r,h"(i16 %521) #3, !dbg !49 %538 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %522) #3, !dbg !49 %539 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %523) #3, !dbg !49 %540 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %524) #3, !dbg !49 %541 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %525) #3, !dbg !49 %542 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %526) #3, !dbg !49 %543 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %527) #3, !dbg !49 %544 = fmul float %369, %465, !dbg !38 %545 = fmul float %369, %467, !dbg !38 %546 = fmul float %370, %469, !dbg !38 %547 = fmul float %370, %471, !dbg !38 %548 = fmul float %371, %473, !dbg !38 %549 = fmul float %371, %475, !dbg !38 %550 = fmul float %372, %477, !dbg !38 %551 = fmul float %372, %479, !dbg !38 %552 = fmul float %373, %481, !dbg !38 %553 = fmul float %373, %483, !dbg !38 %554 = fmul float %374, %485, !dbg !38 %555 = fmul float %374, %487, !dbg !38 %556 = fmul float %375, %489, !dbg !38 %557 = fmul float %375, %491, !dbg !38 %558 = fmul float %376, %493, !dbg !38 %559 = fmul float %376, %495, !dbg !38 %560 = fmul float %528, 0x3FF7154760000000, !dbg !50 %561 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %560) #3, !dbg !50 %562 = fmul float %529, 0x3FF7154760000000, !dbg !50 %563 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %562) #3, !dbg !50 %564 = fmul float %530, 0x3FF7154760000000, !dbg !50 %565 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %564) #3, !dbg !50 %566 = fmul float %531, 0x3FF7154760000000, !dbg !50 %567 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %566) #3, !dbg !50 %568 = fmul float %532, 0x3FF7154760000000, !dbg !50 %569 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %568) #3, !dbg !50 %570 = fmul float %533, 0x3FF7154760000000, !dbg !50 %571 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %570) #3, !dbg !50 %572 = fmul float 
%534, 0x3FF7154760000000, !dbg !50 %573 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %572) #3, !dbg !50 %574 = fmul float %535, 0x3FF7154760000000, !dbg !50 %575 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %574) #3, !dbg !50 %576 = fmul float %536, 0x3FF7154760000000, !dbg !50 %577 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %576) #3, !dbg !50 %578 = fmul float %537, 0x3FF7154760000000, !dbg !50 %579 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %578) #3, !dbg !50 %580 = fmul float %538, 0x3FF7154760000000, !dbg !50 %581 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %580) #3, !dbg !50 %582 = fmul float %539, 0x3FF7154760000000, !dbg !50 %583 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %582) #3, !dbg !50 %584 = fmul float %540, 0x3FF7154760000000, !dbg !50 %585 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %584) #3, !dbg !50 %586 = fmul float %541, 0x3FF7154760000000, !dbg !50 %587 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %586) #3, !dbg !50 %588 = fmul float %542, 0x3FF7154760000000, !dbg !50 %589 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %588) #3, !dbg !50 %590 = fmul float %543, 0x3FF7154760000000, !dbg !50 %591 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %590) #3, !dbg !50 %592 = fmul float %359, %561, !dbg !51 %593 = fmul float %359, %563, !dbg !51 %594 = fmul float %360, %565, !dbg !51 %595 = fmul float %360, %567, !dbg !51 %596 = fmul float %361, %569, !dbg !51 %597 = fmul float %361, %571, !dbg !51 %598 = fmul float %362, %573, !dbg !51 %599 = fmul float %362, %575, !dbg !51 %600 = fmul float %363, %577, !dbg !51 %601 = fmul float %363, %579, !dbg !51 %602 = fmul float %364, %581, !dbg !51 %603 = fmul float %364, %583, !dbg !51 %604 = fmul float %365, %585, !dbg !51 %605 = fmul float %365, %587, !dbg !51 %606 = fmul float %366, %589, !dbg !51 %607 = fmul float %366, %591, !dbg !51 %608 = fsub 
float %544, %592, !dbg !52 %609 = fsub float %545, %593, !dbg !52 %610 = fsub float %546, %594, !dbg !52 %611 = fsub float %547, %595, !dbg !52 %612 = fsub float %548, %596, !dbg !52 %613 = fsub float %549, %597, !dbg !52 %614 = fsub float %550, %598, !dbg !52 %615 = fsub float %551, %599, !dbg !52 %616 = fsub float %552, %600, !dbg !52 %617 = fsub float %553, %601, !dbg !52 %618 = fsub float %554, %602, !dbg !52 %619 = fsub float %555, %603, !dbg !52 %620 = fsub float %556, %604, !dbg !52 %621 = fsub float %557, %605, !dbg !52 %622 = fsub float %558, %606, !dbg !52 %623 = fsub float %559, %607, !dbg !52 %624 = fadd float %432, %608, !dbg !53 %625 = fadd float %433, %609, !dbg !53 %626 = fadd float %434, %610, !dbg !53 %627 = fadd float %435, %611, !dbg !53 %628 = fadd float %436, %612, !dbg !53 %629 = fadd float %437, %613, !dbg !53 %630 = fadd float %438, %614, !dbg !53 %631 = fadd float %439, %615, !dbg !53 %632 = fadd float %440, %616, !dbg !53 %633 = fadd float %441, %617, !dbg !53 %634 = fadd float %442, %618, !dbg !53 %635 = fadd float %443, %619, !dbg !53 %636 = fadd float %444, %620, !dbg !53 %637 = fadd float %445, %621, !dbg !53 %638 = fadd float %446, %622, !dbg !53 %639 = fadd float %447, %623, !dbg !53 %640 = getelementptr i16, ptr addrspace(1) %6, i64 %384, !dbg !54 %641 = getelementptr i16, ptr addrspace(1) %6, i64 %385, !dbg !54 %642 = getelementptr i16, ptr addrspace(1) %6, i64 %386, !dbg !54 %643 = getelementptr i16, ptr addrspace(1) %6, i64 %387, !dbg !54 %644 = getelementptr i16, ptr addrspace(1) %6, i64 %388, !dbg !54 %645 = getelementptr i16, ptr addrspace(1) %6, i64 %389, !dbg !54 %646 = getelementptr i16, ptr addrspace(1) %6, i64 %390, !dbg !54 %647 = getelementptr i16, ptr addrspace(1) %6, i64 %391, !dbg !54 %648 = getelementptr i16, ptr addrspace(1) %6, i64 %392, !dbg !54 %649 = getelementptr i16, ptr addrspace(1) %6, i64 %393, !dbg !54 %650 = getelementptr i16, ptr addrspace(1) %6, i64 %394, !dbg !54 %651 = getelementptr i16, ptr 
addrspace(1) %6, i64 %395, !dbg !54 %652 = getelementptr i16, ptr addrspace(1) %6, i64 %396, !dbg !54 %653 = getelementptr i16, ptr addrspace(1) %6, i64 %397, !dbg !54 %654 = getelementptr i16, ptr addrspace(1) %6, i64 %398, !dbg !54 %655 = getelementptr i16, ptr addrspace(1) %6, i64 %399, !dbg !54 %656 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %624) #3, !dbg !55 %657 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %625) #3, !dbg !55 %658 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %626) #3, !dbg !55 %659 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %627) #3, !dbg !55 %660 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %628) #3, !dbg !55 %661 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %629) #3, !dbg !55 %662 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %630) #3, !dbg !55 %663 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %631) #3, !dbg !55 %664 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %632) #3, !dbg !55 %665 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %633) #3, !dbg !55 %666 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %634) #3, !dbg !55 %667 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %635) #3, !dbg !55 %668 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %636) #3, !dbg !55 %669 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %637) #3, !dbg !55 %670 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %638) #3, !dbg !55 %671 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %639) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %656, ptr addrspace(1) %640, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %657, ptr addrspace(1) %641, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 
%658, ptr addrspace(1) %642, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %659, ptr addrspace(1) %643, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %660, ptr addrspace(1) %644, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %661, ptr addrspace(1) %645, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %662, ptr addrspace(1) %646, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %663, ptr addrspace(1) %647, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %664, ptr addrspace(1) %648, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %665, ptr addrspace(1) %649, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %666, ptr addrspace(1) %650, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %667, ptr addrspace(1) %651, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %668, ptr addrspace(1) %652, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %669, ptr addrspace(1) %653, i1 %383) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %670, ptr addrspace(1) %654, i1 %382) #3, !dbg !55 tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %671, ptr addrspace(1) %655, i1 %383) #3, !dbg !55 %672 = add nuw nsw i32 %378, 512, !dbg !39 %673 = icmp ult i32 %378, 49745, !dbg !39 br i1 %673, label %377, label %674, !dbg !39 674: ; preds = %377 ret void, !dbg !56 } ; Function Attrs: mustprogress 
; (wrapped tail of the "Function Attrs: mustprogress ..." comment that begins at the end of the previous line)
; nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2

; Attribute groups. #0-#2 are attached to the intrinsic declarations above;
; #3 is the group referenced by the inline-asm call sites inside the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }

; Named module-level metadata.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; NOTE(review): !3 and !4 are each listed twice here; kept exactly as emitted.
!nvvm.annotations = !{!3, !4, !4, !3}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")

; NVVM annotations for @triton__0d1d2d3d4d5d6d7de8: "kernel" = 1 and "maxntidx" = 256.
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}

; Debug-info subprogram for the kernel (declared at line 18 of file !2).
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}

; Source locations scoped directly to the kernel subprogram !5.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 34, scope: !5)
!12 = !DILocation(line: 21, column: 46, scope: !5)
!13 = !DILocation(line: 22, column: 23, scope: !5)
!14 = !DILocation(line: 26, column: 30, scope: !5)
!15 = !DILocation(line: 26, column: 35, scope: !5)
!16 = !DILocation(line: 27, column: 19, scope: !5)
!17 = !DILocation(line: 29, column: 19, scope: !5)
!18 = !DILocation(line: 36, column: 46, scope: !5)
!19 = !DILocation(line: 38, column: 23, scope: !5)
!20 = !DILocation(line: 39, column: 22, scope: !5)
!21 = !DILocation(line: 41, column: 37, scope: !5)
!22 = !DILocation(line: 32, column: 36, scope: !5)
!23 = !DILocation(line: 33, column: 27, scope: !5)
!24 = !DILocation(line: 34, column: 25, scope: !5)
!25 = !DILocation(line: 36, column: 34, scope: !5)
!26 = !DILocation(line: 36, column: 52, scope: !5)
!27 = !DILocation(line: 42, column: 23, scope: !5)
!28 = !DILocation(line: 45, column: 40, scope: !5)

; Locations whose scopes are lexical-block-file nodes for file !31 ("standard.py"),
; chained through inlinedAt.
!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33)
!30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0)
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
!33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34)
!34 = !DILocation(line: 46, column: 27, scope: !30)
!35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36)
!36 = !DILocation(line: 46, column: 27, scope: !32)

; Remaining source locations scoped directly to the kernel subprogram !5.
!37 = !DILocation(line: 52, column: 27, scope: !5)
!38 = !DILocation(line: 63, column: 24, scope: !5)
!39 = !DILocation(line: 51, column: 36, scope: !5)
!40 = !DILocation(line: 53, column: 25, scope: !5)
!41 = !DILocation(line: 55, column: 41, scope: !5)
!42 = !DILocation(line: 55, column: 35, scope: !5)
!43 = !DILocation(line: 55, column: 53, scope: !5)
!44 = !DILocation(line: 55, column: 105, scope: !5)
!45 = !DILocation(line: 56, column: 35, scope: !5)
!46 = !DILocation(line: 56, column: 53, scope: !5)
!47 = !DILocation(line: 57, column: 35, scope: !5)
!48 = !DILocation(line: 57, column: 53, scope: !5)
!49 = !DILocation(line: 57, column: 105, scope: !5)
!50 = !DILocation(line: 65, column: 23, scope: !5)
!51 = !DILocation(line: 66, column: 24, scope: !5)
!52 = !DILocation(line: 67, column: 24, scope: !5)
!53 = !DILocation(line: 69, column: 24, scope: !5)
!54 = !DILocation(line: 70, column: 29, scope: !5)
!55 = !DILocation(line: 70, column: 54, scope: !5)
!56 = !DILocation(line: 51, column: 4, scope: !5)